"docs/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "9dc84448aca9718f9e1175cf83a6a9c10467882a"
Unverified Commit 33a8bb93 authored by xiang song (charlie.song), committed by GitHub

[Feature] Basic utils to handle raw data features (#2102)



* add feature utils and add test for feature norm

* Add docstring and test

* upd

* Disable some tests

* Update

* update doc string

* update
Co-authored-by: Ubuntu <ubuntu@ip-172-31-68-185.ec2.internal>
parent b2ac89f2
@@ -8,10 +8,18 @@ import warnings
import requests
import pickle
import errno
from multiprocessing import Manager, Process
import numpy as np
import scipy.sparse as sp
try:
import spacy
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
except ImportError:
pass
from .graph_serialize import save_graphs, load_graphs, load_labels
from .tensor_serialize import save_tensors, load_tensors
@@ -20,7 +28,10 @@ from .. import backend as F
__all__ = ['loadtxt', 'download', 'check_sha1', 'extract_archive',
           'get_download_dir', 'Subset', 'split_dataset',
           'save_graphs', 'load_graphs', 'load_labels', 'save_tensors', 'load_tensors',
           'parse_word2vec_feature', 'parse_category_single_feat',
           'parse_category_multi_feat', 'parse_numerical_feat',
           'parse_numerical_multihot_feat']
def loadtxt(path, delimiter, dtype=None):
try:
@@ -350,3 +361,412 @@ class Subset(object):
Number of datapoints in the subset
"""
return len(self.indices)
################### Feature Processing #######################
def row_normalize(features):
    """Row-normalize sparse matrix"""
    mx = sp.csr_matrix(features, dtype=np.float32)
rowsum = np.array(mx.sum(1))
r_inv = np.power(rowsum, -1).flatten()
r_inv[np.isinf(r_inv)] = 0.
r_mat_inv = sp.diags(r_inv)
mx = r_mat_inv.dot(mx)
return np.array(mx.todense())
def col_normalize(features):
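    """Column-normalize a feature matrix (each column sums to 1); returns a dense array."""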
mx = sp.csr_matrix(features, dtype=np.float32)
colsum = np.array(mx.sum(0))
c_inv = np.power(colsum, -1).flatten()
c_inv[np.isinf(c_inv)] = 0.
c_mat_inv = sp.diags(c_inv).transpose()
mx = mx.dot(c_mat_inv)
return np.array(mx.todense())
def float_row_l1_normalize(features):
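    """Row-wise L1-normalize a dense float feature matrix."""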
rowsum = np.sum(np.abs(features), axis=1)
r_inv = np.power(rowsum, -1).reshape(-1,1)
r_inv[np.isinf(r_inv)] = 0.
return features * r_inv
def float_col_l1_normalize(features):
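    """Column-wise L1-normalize a dense float feature matrix."""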
colsum = np.sum(np.abs(features), axis=0)
c_inv = np.power(colsum, -1)
c_inv[np.isinf(c_inv)] = 0.
return features * c_inv
def float_col_maxmin_normalize(features):
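    """Column-wise min-max normalize a dense float feature matrix into [0, 1]."""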
feats = np.transpose(features)
min_val = np.reshape(np.amin(feats, axis=1), (-1, 1))
max_val = np.reshape(np.amax(feats, axis=1), (-1, 1))
norm = (feats - min_val) / (max_val - min_val)
norm[np.isnan(norm)] = 0.
return np.transpose(norm)
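# Illustrative sanity check (not part of the original change): row_normalize
# scales each row to sum to 1, col_normalize scales each column to sum to 1.
#
# >>> x = np.array([[1., 0.], [2., 2.]])
# >>> row_normalize(x)
# array([[1. , 0. ], [0.5, 0.5]], dtype=float32)
# >>> col_normalize(x)
# array([[0.33333334, 0.], [0.6666667, 1.]], dtype=float32)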
def embed_word2vec(str_val, nlps):
""" Use NLP encoder to encode the string into vector
    There can be multiple NLP encoders in nlps. Each encoder
    is invoked to generate an embedding for the input string and
    the resulting embeddings are concatenated.
Parameters
----------
str_val : str
words to encode
nlps : list of func
a list of nlp encoder functions
"""
vector = None
for nlp in nlps:
doc = nlp(str_val)
if vector is None:
vector = doc.vector
else:
vector = np.concatenate((vector, doc.vector))
return vector
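# Illustrative sketch only: embed_word2vec accepts any callables whose result
# exposes a ``.vector`` attribute, so a toy encoder (an assumption here, not a
# spacy model) is enough to see the concatenation behaviour.
#
# >>> class _ToyDoc:
# ...     def __init__(self, vec):
# ...         self.vector = vec
# >>> toy_nlp = lambda text: _ToyDoc(np.array([float(len(text))]))
# >>> embed_word2vec('hello', [toy_nlp, toy_nlp])
# array([5., 5.])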
def parse_lang_feat(str_feats, nlp_encoders, verbose=False):
""" Parse a list of strings using word2vec encoding using NLP encoders in nlps
Parameters
----------
str_feats : list of str
list of strings to encode
nlp_encoders : list of func
a list of nlp encoder functions
verbose : bool, optional
print out debug info
Default: False
Return
------
numpy.array
the encoded features
"""
features = []
num_feats = len(str_feats)
num_process = num_feats if num_feats < 8 else 8 # TODO(xiangsx) get system nproc
batch_size = (num_feats + num_process - 1) // num_process
def embed_lang(d, proc_idx, feats):
res_feats = []
for s_feat in feats:
res_feats.append(embed_word2vec(s_feat, nlp_encoders))
d[proc_idx] = res_feats
# use multi process to process the feature
manager = Manager()
d = manager.dict()
job=[]
for i in range(num_process):
sub_info = str_feats[i * batch_size : (i+1) * batch_size \
if (i+1) * batch_size < num_feats else num_feats]
job.append(Process(target=embed_lang, args=(d, i, sub_info)))
for p in job:
p.start()
for p in job:
p.join()
for i in range(num_process):
if len(d[i]) > 0:
features.append(d[i])
features = np.concatenate(features)
if verbose:
print(features.shape)
return features
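# Worked example of the batching above: with num_feats = 12 strings,
# num_process = 8 and batch_size = (12 + 8 - 1) // 8 = 2, so workers 0..5 each
# receive two strings while workers 6 and 7 get empty slices, which the
# len(d[i]) > 0 check skips when the results are gathered.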
def parse_word2vec_feature(str_feats, languages, verbose=False):
""" Parse a list of strings using word2vec encoding using NLP encoders in nlps
Parameters
----------
str_feats : list of str
list of strings to encode
languages : list of string
list of languages used to encode the feature string.
verbose : bool, optional
print out debug info
Default: False
Return
------
numpy.array
the encoded features
Examples
--------
>>> inputs = ['hello', 'world']
>>> languages = ['en_core_web_lg', 'fr_core_news_lg']
    >>> feats = parse_word2vec_feature(inputs, languages)
"""
import spacy
nlp_encoders = []
for lang in languages:
encoder = spacy.load(lang)
nlp_encoders.append(encoder)
return parse_lang_feat(str_feats, nlp_encoders, verbose)
def parse_category_single_feat(category_inputs, norm=None):
""" Parse categorical features and convert it into onehot encoding.
Each entity of category_inputs should only contain only one category.
Parameters
----------
category_inputs : list of str
input categorical features
norm: str, optional
Which kind of normalization is applied to the features.
Supported normalization ops include:
(1) None, do nothing.
(2) `col`, column-based normalization. Normalize the data
for each column:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}
        (3) `row`, same as None.
Note
----
sklearn.preprocessing.LabelBinarizer is used to convert
categorical features into a onehot encoding format.
Return
------
numpy.array
The features in numpy array
Examples
--------
>>> inputs = ['A', 'B', 'C', 'A']
>>> feats = parse_category_single_feat(inputs)
>>> feats
array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.],[1.,0.,0.]])
"""
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
feat = lb.fit_transform(category_inputs)
    # if there are only 2 categories,
    # fit_transform only creates an array of [0, 1, ...]
if feat.shape[1] == 1:
f = np.zeros((feat.shape[0], 2))
f[range(f.shape[0]),feat.squeeze()] = 1.
feat = f
if norm == 'col':
return col_normalize(feat)
else:
return feat
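# Why the two-column expansion above is needed (illustrative, assumes sklearn
# is available): with exactly two classes LabelBinarizer emits a single 0/1
# column, which parse_category_single_feat widens back to one-hot.
#
# >>> LabelBinarizer().fit_transform(['A', 'B', 'A'])
# array([[0], [1], [0]])
# >>> parse_category_single_feat(['A', 'B', 'A'])
# array([[1., 0.], [0., 1.], [1., 0.]])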
def parse_category_multi_feat(category_inputs, norm=None):
""" Parse categorical features and convert it into multi-hot encoding.
Each entity of category_inputs may contain multiple categorical labels.
It uses multi-hot encoding to encode these labels.
Parameters
----------
category_inputs : list of list of str
input categorical features
norm: str, optional
Which kind of normalization is applied to the features.
Supported normalization ops include:
(1) None, do nothing.
(2) `col`, column-based normalization. Normalize the data
for each column:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}
(3) `row`, row-based normalization. Normalize the data for
each row:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{j=0}^N{x_{ij}}}
Default: None
Note
----
sklearn.preprocessing.MultiLabelBinarizer is used to convert
categorical features into a multilabel format.
Return
------
numpy.array
The features in numpy array
Example
-------
>>> inputs = [['A', 'B', 'C',], ['A', 'B'], ['C'], ['A']]
>>> feats = parse_category_multi_feat(inputs)
>>> feats
array([[1.,1.,1.],[1.,1.,0.],[0.,0.,1.],[1.,0.,0.]])
"""
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
feat = mlb.fit_transform(category_inputs)
if norm == 'col':
return col_normalize(feat)
if norm == 'row':
return row_normalize(feat)
else:
return feat
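# Normalization sketch: with norm='row' each multi-hot row is divided by the
# number of active categories, so rows sum to 1.
#
# >>> parse_category_multi_feat([['A', 'B'], ['C']], norm='row')
# array([[0.5, 0.5, 0. ], [0. , 0. , 1. ]])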
def parse_numerical_feat(numerical_inputs, norm=None):
""" Parse numerical features.
Parameters
----------
numerical_inputs : list of float or list of list of float
input numerical features
norm: str, optional
Which kind of normalization is applied to the features.
Supported normalization ops include:
(1) None, do nothing.
        (2) `standard`, column-based normalization. Normalize the data
for each column:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{|x_{ij}|}}
(3) `min-max`: column-based min-max normalization. Normalize the data
for each column:
.. math::
            x_{ij} = \frac{x_{ij} - \min_i(x_{ij})}{\max_i(x_{ij}) - \min_i(x_{ij})}
Return
------
numpy.array
The features in numpy array
    Example
    -------
>>> inputs = [[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]]
>>> feat = parse_numerical_feat(inputs, norm='col')
>>> feat
array([[0.25, 0., 0.],[0.5, 0.33333333, 0.25],[0.25, 0.66666667, -0.75]])
"""
feat = np.array(numerical_inputs, dtype='float')
if norm == 'standard':
return float_col_l1_normalize(feat)
elif norm == 'min-max':
return float_col_maxmin_normalize(feat)
else:
return feat
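# Min-max sketch: norm='min-max' rescales every column to [0, 1] using that
# column's own min and max; a constant column divides 0 by 0 and is mapped to
# all zeros by the NaN handling in float_col_maxmin_normalize.
#
# >>> parse_numerical_feat([[1., 10.], [3., 30.]], norm='min-max')
# array([[0., 0.], [1., 1.]])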
def parse_numerical_multihot_feat(input_feats, low, high, bucket_cnt, window_size, norm=None):
r""" Parse numerical features by matching them into
different buckets.
    A bucket-range-based algorithm is used to convert a numerical value into a multi-hot
    encoding feature.
    A numerical value range [low, high) is defined, and it is
    divided into bucket_cnt buckets. For an input value v, its effective range is
    [v - window_size/2, v + window_size/2], and we check how many buckets it covers in
    [low, high).
Parameters
----------
input_feats : list of float
Input numerical features
low : float
Lower bound of the range of the numerical values.
All v_i < low will be set to v_i = low.
high : float
Upper bound of the range of the numerical values.
All v_j > high will be set to v_j = high.
bucket_cnt: int
Number of bucket to use.
    window_size: float
        The size of the sliding window used to convert a numerical value into bucket numbers.
norm: str, optional
Which kind of normalization is applied to the features.
Supported normalization ops include:
(1) None, do nothing.
(2) `col`, column-based normalization. Normalize the data
for each column:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}
(3) `row`, row-based normalization. Normalize the data for
each row:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{j=0}^N{x_{ij}}}
Example
-------
>>> inputs = [0., 15., 26., 40.]
>>> low = 10.
>>> high = 30.
>>> bucket_cnt = 4
    >>> window_size = 10. # buckets: 10~15, 15~20, 20~25, 25~30
>>> feat = parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
>>> feat
array([[1., 0., 0., 0],
[1., 1., 1., 0.],
[0., 0., 1., 1.],
[0., 0., 0., 1.]])
"""
raw_feats = np.array(input_feats, dtype=np.float32)
num_nodes = raw_feats.shape[0]
feat = np.zeros((num_nodes, bucket_cnt), dtype=np.float32)
bucket_size = (high - low) / bucket_cnt
    epsilon = bucket_size / 10
low_val = raw_feats - window_size/2
high_val = raw_feats + window_size/2
low_val[low_val < low] = low
high_val[high_val < low] = low
    high_val[high_val >= high] = high - epsilon
    low_val[low_val >= high] = high - epsilon
low_val -= low
high_val -= low
low_idx = (low_val / bucket_size).astype('int')
high_idx = (high_val / bucket_size).astype('int') + 1
for i in range(raw_feats.shape[0]):
idx = np.arange(start=low_idx[i], stop=high_idx[i])
feat[i][idx] = 1.
if norm == 'col':
return col_normalize(feat)
if norm == 'row':
return row_normalize(feat)
else:
return feat
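# Index arithmetic for the docstring example (low=10, high=30, bucket_cnt=4,
# bucket_size=5, window_size=10): the input 15. yields the window [10, 20],
# shifted by low to [0, 10], so low_idx = int(0 / 5) = 0 and
# high_idx = int(10 / 5) + 1 = 3; buckets 0..2 are set -> [1., 1., 1., 0.].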
import dgl.data as data
import unittest, pytest
import numpy as np
def test_minigc():
ds = data.MiniGCDataset(16, 10, 20)
@@ -18,6 +20,331 @@ def test_data_hash():
assert a.hash == b.hash
assert a.hash != c.hash
def test_row_normalize():
features = np.array([[1., 1., 1.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([1./3., 1./3., 1./3.]), row_norm_feat)
features = np.array([[1.], [1.], [1.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([[1.], [1.], [1.]]), row_norm_feat)
features = np.array([[1., 0., 0.],[0., 1., 1.],[0., 0., 0.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([[1., 0., 0.],[0., 0.5, 0.5],[0., 0., 0.]]),
row_norm_feat)
# input (2, 3)
features = np.array([[1., 0., 0.],[2., 1., 1.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([[1., 0., 0.],[0.5, 0.25, 0.25]]),
row_norm_feat)
# input (3, 2)
features = np.array([[1., 0.],[1., 1.],[0., 0.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([[1., 0.],[0.5, 0.5],[0., 0.]]),
row_norm_feat)
def test_col_normalize():
features = np.array([[1., 1., 1.]])
col_norm_feat = data.utils.col_normalize(features)
assert np.allclose(np.array([[1., 1., 1.]]), col_norm_feat)
features = np.array([[1.], [1.], [1.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[1./3.],[1./3.], [1./3.]]), col_norm_feat)
features = np.array([[1., 0., 0.],[1., 1., 0.],[0., 0., 0.]])
col_norm_feat = data.utils.col_normalize(features)
assert np.allclose(np.array([[0.5, 0., 0.],[0.5, 1.0, 0.],[0., 0., 0.]]),
col_norm_feat)
    # input (2, 3)
features = np.array([[1., 0., 0.],[1., 1., 0.]])
col_norm_feat = data.utils.col_normalize(features)
assert np.allclose(np.array([[0.5, 0., 0.],[0.5, 1.0, 0.]]),
col_norm_feat)
    # input (3, 2)
features = np.array([[1., 0.],[1., 1.],[2., 0.]])
col_norm_feat = data.utils.col_normalize(features)
assert np.allclose(np.array([[0.25, 0.],[0.25, 1.0],[0.5, 0.]]),
col_norm_feat)
def test_float_row_normalize():
features = np.array([[1.],[2.],[-3.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1.],[1.],[-1.]]), row_norm_feat)
features = np.array([[1., 2., -3.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1./6., 2./6., -3./6.]]), row_norm_feat)
features = np.array([[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1., 0., 0.],[0.5, 0.25, 0.25],[1./6., 2./6., -3./6.]]),
row_norm_feat)
    # input (2, 3)
features = np.array([[1., 0., 0.],[-2., 1., 1.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1., 0., 0.],[-0.5, 0.25, 0.25]]),
row_norm_feat)
# input (3, 2)
features = np.array([[1., 0.],[-2., 1.],[1., 2.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1., 0.],[-2./3., 1./3.],[1./3., 2./3.]]),
row_norm_feat)
def test_float_col_normalize():
features = np.array([[1., 2., -3.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[1., 1., -1.]]), col_norm_feat)
features = np.array([[1.], [2.], [-3.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[1./6.],[2./6.], [-3./6.]]), col_norm_feat)
features = np.array([[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[0.25, 0., 0.],[0.5, 1./3., 0.25],[0.25, 2./3., -0.75]]),
col_norm_feat)
    # input (2, 3)
features = np.array([[1., 0., 0.],[2., 1., -1.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[1./3., 0., 0.],[2./3., 1.0, -1.]]),
col_norm_feat)
    # input (3, 2)
features = np.array([[1., 0.],[2., 1.],[1., -2.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[0.25, 0.],[0.5, 1./3.],[0.25, -2./3.]]),
col_norm_feat)
def test_float_col_maxmin_normalize():
features = np.array([[1., 2., -3.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[0., 0., 0.]]), col_norm_feat)
features = np.array([[1.], [2.], [-3.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[4./5.],[5./5.], [0.]]), col_norm_feat)
features = np.array([[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[0., 0., 3./4.],[1., 0.5, 1.],[0., 1., 0.]]),
col_norm_feat)
    # input (2, 3)
features = np.array([[1., 0., 0.],[2., 1., -1.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[0., 0., 1.],[1., 1., 0.]]),
col_norm_feat)
    # input (3, 2)
features = np.array([[1., 0.],[2., 1.],[4., -2.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[0., 2./3.],[1./3., 1.],[1., 0.]]),
col_norm_feat)
@unittest.skip("spacy language test is too heavy")
def test_embed_word2vec():
import spacy
inputs = ['hello', 'world']
languages = ['en_core_web_lg', 'fr_core_news_lg']
nlps = [spacy.load(languages[0])]
feats = data.utils.embed_word2vec(inputs[0], nlps)
doc = nlps[0](inputs[0])
assert np.allclose(doc.vector, feats)
nlps.append(spacy.load(languages[1]))
for input in inputs:
feats = data.utils.embed_word2vec(input, nlps)
doc0 = nlps[0](input)
doc1 = nlps[1](input)
assert np.allclose(np.concatenate((doc0.vector, doc1.vector)),
feats)
@unittest.skip("spacy language test is too heavy")
def test_parse_lang_feat():
import spacy
inputs = ['hello', 'world']
languages = ['en_core_web_lg', 'fr_core_news_lg']
nlps = [spacy.load(languages[0]), spacy.load(languages[1])]
feats = data.utils.parse_lang_feat(inputs, nlps)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
inputs = ["1", "2", "3", "4", "1", "2", "3", "4", "5", "6", "7", "8"]
feats = data.utils.parse_lang_feat(inputs, nlps)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
inputs = ["1", "2", "3", "4", "1", "2", "3", "4", "5", "6", "7", "8"]
feats = data.utils.parse_word2vec_feature(inputs, languages)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
@unittest.skip("LabelBinarizer and MultiLabelBinarizer is not included in CI env")
def test_parse_category_feat():
# single-hot
inputs = ['A', 'B']
feats = data.utils.parse_category_single_feat(inputs)
assert np.allclose(np.array([[1.,0.],[0.,1.]]), feats)
inputs = ['A', 'B', 'C', 'A']
feats = data.utils.parse_category_single_feat(inputs)
assert np.allclose(np.array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# col norm
feats = data.utils.parse_category_single_feat(inputs, norm='col')
assert np.allclose(np.array([[.5,0.,0.],[0.,1.,0.],[0.,0.,1.],[.5,0.,0.]]), feats)
# multi-hot
inputs = [['A'], ['B']]
feats = data.utils.parse_category_multi_feat(inputs)
assert np.allclose(np.array([[1.,0.],[0.,1.]]), feats)
inputs = [['A', 'B', 'C',], ['A', 'B'], ['C'], ['A']]
feats = data.utils.parse_category_multi_feat(inputs)
assert np.allclose(np.array([[1.,1.,1.],[1.,1.,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# row norm
feats = data.utils.parse_category_multi_feat(inputs, norm='row')
assert np.allclose(np.array([[1./3.,1./3.,1./3.],[.5,.5,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# col norm
feats = data.utils.parse_category_multi_feat(inputs, norm='col')
assert np.allclose(np.array([[1./3.,0.5,0.5],[1./3.,0.5,0.],[0.,0.,0.5],[1./3.,0.,0.]]), feats)
def test_parse_numerical_feat():
inputs = [[1., 2., -3.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[1., 1., -1.]]), col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[0., 0., 0.]]), col_norm_feat)
inputs = [[1.], [2.], [-3.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[1./6.],[2./6.], [-3./6.]]), col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[4./5.],[5./5.], [0.]]), col_norm_feat)
inputs = [[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[0.25, 0., 0.],[0.5, 1./3., 0.25],[0.25, 2./3., -0.75]]),
col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[0., 0., 3./4.],[1., 0.5, 1.],[0., 1., 0.]]),
col_norm_feat)
    # input (2, 3)
inputs = [[1., 0., 0.],[2., 1., -1.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[1./3., 0., 0.],[2./3., 1.0, -1.]]),
col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[0., 0., 1.],[1., 1., 0.]]),
col_norm_feat)
    # input (3, 2)
inputs = [[1., 0.],[2., 1.],[1., -2.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[0.25, 0.],[0.5, 1./3.],[0.25, -2./3.]]),
col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[0., 2./3.],[1., 1.],[0., 0.]]),
col_norm_feat)
def test_parse_numerical_multihot_feat():
inputs = [0., 15., 20., 10.1, 25., 40.]
low = 10.
high = 30.
bucket_cnt = 2 #10~20, 20~30
window_size = 0.
feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
assert np.allclose(np.array([[1., 0.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [0., 1.]]), feat)
inputs = [0., 5., 15., 20., 10.1, 25., 30.1, 40.]
low = 10.
high = 30.
bucket_cnt = 4 #10~15,15~20,20~25,25~30
window_size = 10.
feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
assert np.allclose(np.array([[1., 0., 0., 0],
[1., 0., 0., 0],
[1., 1., 1., 0.],
[0., 1., 1., 1.],
[1., 1., 0., 0.],
[0., 0., 1., 1.],
[0., 0., 0., 1.],
[0., 0., 0., 1.]]), feat)
# col norm
feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size, norm='col')
assert np.allclose(np.array([[1./4., 0., 0., 0],
[1./4., 0., 0., 0],
[1./4., 1./3., 1./3., 0.],
[0., 1./3., 1./3., 1./4.],
[1./4., 1./3., 0., 0.],
[0., 0., 1./3., 1./4.],
[0., 0., 0., 1./4.],
[0., 0., 0., 1./4.]]), feat)
# row norm
feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size, norm='row')
assert np.allclose(np.array([[1., 0., 0., 0],
[1., 0., 0., 0],
[1./3., 1./3., 1./3., 0.],
[0., 1./3., 1./3., 1./3.],
[1./2., 1./2., 0., 0.],
[0., 0., 1./2., 1./2.],
[0., 0., 0., 1.],
[0., 0., 0., 1.]]), feat)
if __name__ == '__main__':
test_minigc()
test_data_hash()
test_row_normalize()
test_col_normalize()
test_float_row_normalize()
test_float_col_normalize()
test_float_col_maxmin_normalize()
#test_embed_word2vec()
#test_parse_lang_feat()
#test_parse_category_feat()
test_parse_numerical_feat()
test_parse_numerical_multihot_feat()
\ No newline at end of file