"src/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "a80480f0f2c6b7981297e62a4878a4515148f1ad"
Unverified commit c9c6171b authored by xiang song (charlie.song), committed by GitHub

Revert "[Feature] Basic utils to handle raw data features (#2102)" (#2147)

This reverts commit 33a8bb93.
Co-authored-by: Minjie Wang <wmjlyjemaine@gmail.com>
parent 3659fb05
@@ -8,18 +8,10 @@ import warnings
 import requests
 import pickle
 import errno
-from multiprocessing import Manager, Process
 import numpy as np
-import scipy.sparse as sp
-try:
-    import spacy
-    from sklearn.preprocessing import LabelBinarizer
-    from sklearn.preprocessing import MultiLabelBinarizer
-except ImportError:
-    pass
-import pickle
-import errno
 from .graph_serialize import save_graphs, load_graphs, load_labels
 from .tensor_serialize import save_tensors, load_tensors
@@ -28,10 +20,7 @@ from .. import backend as F
-__all__ = ['loadtxt', 'download', 'check_sha1', 'extract_archive',
-           'get_download_dir', 'Subset', 'split_dataset',
-           'save_graphs', "load_graphs", "load_labels", "save_tensors", "load_tensors",
-           'parse_word2vec_feature', 'parse_category_single_feat',
-           'parse_category_multi_feat', 'parse_numerical_feat',
-           'parse_numerical_multihot_feat']
+__all__ = ['loadtxt', 'download', 'check_sha1', 'extract_archive',
+           'get_download_dir', 'Subset', 'split_dataset',
+           'save_graphs', "load_graphs", "load_labels", "save_tensors", "load_tensors"]
 def loadtxt(path, delimiter, dtype=None):
     try:
@@ -361,412 +350,3 @@ class Subset(object):
        Number of datapoints in the subset
        """
        return len(self.indices)
################### Feature Processing #######################

def row_normalize(features):
    """Row-normalize a sparse matrix so that each row sums to 1."""
    mx = sp.csr_matrix(features, dtype=np.float32)
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return np.array(mx.todense())

def col_normalize(features):
    """Column-normalize a sparse matrix so that each column sums to 1."""
    mx = sp.csr_matrix(features, dtype=np.float32)
    colsum = np.array(mx.sum(0))
    c_inv = np.power(colsum, -1).flatten()
    c_inv[np.isinf(c_inv)] = 0.
    c_mat_inv = sp.diags(c_inv).transpose()
    mx = mx.dot(c_mat_inv)
    return np.array(mx.todense())

def float_row_l1_normalize(features):
    """L1-normalize each row of a dense float matrix."""
    rowsum = np.sum(np.abs(features), axis=1)
    r_inv = np.power(rowsum, -1).reshape(-1, 1)
    r_inv[np.isinf(r_inv)] = 0.
    return features * r_inv

def float_col_l1_normalize(features):
    """L1-normalize each column of a dense float matrix."""
    colsum = np.sum(np.abs(features), axis=0)
    c_inv = np.power(colsum, -1)
    c_inv[np.isinf(c_inv)] = 0.
    return features * c_inv

def float_col_maxmin_normalize(features):
    """Min-max normalize each column of a dense float matrix."""
    feats = np.transpose(features)
    min_val = np.reshape(np.amin(feats, axis=1), (-1, 1))
    max_val = np.reshape(np.amax(feats, axis=1), (-1, 1))
    norm = (feats - min_val) / (max_val - min_val)
    norm[np.isnan(norm)] = 0.
    return np.transpose(norm)
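
# A minimal usage sketch of the normalization helpers above (the
# expected values come from the unit tests further down): row_normalize
# scales each row to sum to 1, col_normalize does the same per column,
# and float_col_maxmin_normalize maps each column onto [0, 1].
#
#   >>> feats = np.array([[1., 0., 0.], [2., 1., 1.]])
#   >>> row_normalize(feats)
#   array([[1., 0., 0.], [0.5, 0.25, 0.25]])
#   >>> float_col_maxmin_normalize(np.array([[1., 0.], [2., 1.], [4., -2.]]))
#   array([[0., 2./3.], [1./3., 1.], [1., 0.]])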

def embed_word2vec(str_val, nlps):
    """ Use NLP encoders to encode a string into a vector.

    There can be multiple NLP encoders in nlps. Each encoder
    is invoked to generate an embedding for the input string and
    the resulting embeddings are concatenated.

    Parameters
    ----------
    str_val : str
        words to encode
    nlps : list of func
        a list of NLP encoder functions
    """
    vector = None
    for nlp in nlps:
        doc = nlp(str_val)
        if vector is None:
            vector = doc.vector
        else:
            vector = np.concatenate((vector, doc.vector))
    return vector
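
# Usage sketch, assuming the relevant spacy models have been installed
# (e.g. via `python -m spacy download en_core_web_lg`); with a single
# encoder the result is simply that encoder's document vector, as the
# unit test below also checks:
#
#   >>> import spacy
#   >>> nlps = [spacy.load('en_core_web_lg')]
#   >>> vec = embed_word2vec('hello', nlps)  # == nlps[0]('hello').vector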

def parse_lang_feat(str_feats, nlp_encoders, verbose=False):
    """ Encode a list of strings with word2vec using the given NLP encoders.

    Parameters
    ----------
    str_feats : list of str
        list of strings to encode
    nlp_encoders : list of func
        a list of NLP encoder functions
    verbose : bool, optional
        print out debug info
        Default: False

    Return
    ------
    numpy.array
        the encoded features
    """
    features = []
    num_feats = len(str_feats)
    num_process = num_feats if num_feats < 8 else 8  # TODO(xiangsx) get system nproc
    batch_size = (num_feats + num_process - 1) // num_process

    def embed_lang(d, proc_idx, feats):
        res_feats = []
        for s_feat in feats:
            res_feats.append(embed_word2vec(s_feat, nlp_encoders))
        d[proc_idx] = res_feats

    # use multiple processes to encode the features in parallel
    manager = Manager()
    d = manager.dict()
    jobs = []
    for i in range(num_process):
        sub_info = str_feats[i * batch_size : (i + 1) * batch_size
                             if (i + 1) * batch_size < num_feats else num_feats]
        jobs.append(Process(target=embed_lang, args=(d, i, sub_info)))
    for p in jobs:
        p.start()
    for p in jobs:
        p.join()
    for i in range(num_process):
        if len(d[i]) > 0:
            features.append(d[i])
    features = np.concatenate(features)
    if verbose:
        print(features.shape)
    return features
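
# Worked example of the batching above: with 12 input strings and 8
# worker processes, batch_size = (12 + 8 - 1) // 8 = 2, so workers 0-5
# each encode two strings while workers 6 and 7 receive empty slices;
# the `len(d[i]) > 0` check drops those empty results before the final
# concatenation.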

def parse_word2vec_feature(str_feats, languages, verbose=False):
    """ Encode a list of strings with word2vec, one NLP encoder per language.

    Parameters
    ----------
    str_feats : list of str
        list of strings to encode
    languages : list of str
        list of languages used to encode the feature strings.
    verbose : bool, optional
        print out debug info
        Default: False

    Return
    ------
    numpy.array
        the encoded features

    Examples
    --------
    >>> inputs = ['hello', 'world']
    >>> languages = ['en_core_web_lg', 'fr_core_news_lg']
    >>> feats = parse_word2vec_feature(inputs, languages)
    """
    import spacy

    nlp_encoders = []
    for lang in languages:
        encoder = spacy.load(lang)
        nlp_encoders.append(encoder)
    return parse_lang_feat(str_feats, nlp_encoders, verbose)
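
# Design note: spacy is imported lazily inside the function (mirroring
# the try/except guard at the top of the module) so that dgl.data.utils
# remains importable without spacy; the models named in `languages` are
# assumed to be installed, e.g.:
#
#   >>> feats = parse_word2vec_feature(['hello', 'world'], ['en_core_web_lg'])
#   >>> feats.shape  # (2, 300) for the 300-dim en_core_web_lg vectors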

def parse_category_single_feat(category_inputs, norm=None):
    r""" Parse categorical features and convert them into a one-hot encoding.

    Each entry of category_inputs should contain exactly one category.

    Parameters
    ----------
    category_inputs : list of str
        input categorical features
    norm : str, optional
        Which kind of normalization is applied to the features.
        Supported normalization ops include:

        (1) None, do nothing.
        (2) `col`, column-based normalization. Normalize the data
        for each column:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}

        (3) `row`, same as None.

    Note
    ----
    sklearn.preprocessing.LabelBinarizer is used to convert
    categorical features into a one-hot encoding format.

    Return
    ------
    numpy.array
        The features in numpy array

    Examples
    --------
    >>> inputs = ['A', 'B', 'C', 'A']
    >>> feats = parse_category_single_feat(inputs)
    >>> feats
    array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.], [1., 0., 0.]])
    """
    from sklearn.preprocessing import LabelBinarizer
    lb = LabelBinarizer()
    feat = lb.fit_transform(category_inputs)
    # if there are only two categories, fit_transform returns a single
    # 0/1 column, so expand it into an explicit two-column one-hot encoding
    if feat.shape[1] == 1:
        f = np.zeros((feat.shape[0], 2))
        f[range(f.shape[0]), feat.squeeze()] = 1.
        feat = f

    if norm == 'col':
        return col_normalize(feat)
    else:
        return feat
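
# Sketch of the two-class special case handled above, matching the unit
# test below: LabelBinarizer collapses two classes into one 0/1 column,
# which is then expanded back into a two-column one-hot matrix.
#
#   >>> parse_category_single_feat(['A', 'B'])
#   array([[1., 0.], [0., 1.]])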

def parse_category_multi_feat(category_inputs, norm=None):
    r""" Parse categorical features and convert them into a multi-hot encoding.

    Each entry of category_inputs may contain multiple categorical labels.
    Multi-hot encoding is used to encode these labels.

    Parameters
    ----------
    category_inputs : list of list of str
        input categorical features
    norm : str, optional
        Which kind of normalization is applied to the features.
        Supported normalization ops include:

        (1) None, do nothing.
        (2) `col`, column-based normalization. Normalize the data
        for each column:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}

        (3) `row`, row-based normalization. Normalize the data for
        each row:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{j=0}^N{x_{ij}}}

        Default: None

    Note
    ----
    sklearn.preprocessing.MultiLabelBinarizer is used to convert
    categorical features into a multilabel format.

    Return
    ------
    numpy.array
        The features in numpy array

    Examples
    --------
    >>> inputs = [['A', 'B', 'C'], ['A', 'B'], ['C'], ['A']]
    >>> feats = parse_category_multi_feat(inputs)
    >>> feats
    array([[1., 1., 1.], [1., 1., 0.], [0., 0., 1.], [1., 0., 0.]])
    """
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    feat = mlb.fit_transform(category_inputs)
    if norm == 'col':
        return col_normalize(feat)
    if norm == 'row':
        return row_normalize(feat)
    else:
        return feat
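
# With row normalization each multi-hot row becomes a distribution over
# its own labels, matching the unit test below:
#
#   >>> parse_category_multi_feat([['A', 'B', 'C'], ['A', 'B'], ['C'], ['A']], norm='row')
#   array([[1./3., 1./3., 1./3.], [0.5, 0.5, 0.], [0., 0., 1.], [1., 0., 0.]])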

def parse_numerical_feat(numerical_inputs, norm=None):
    r""" Parse numerical features.

    Parameters
    ----------
    numerical_inputs : list of float or list of list of float
        input numerical features
    norm : str, optional
        Which kind of normalization is applied to the features.
        Supported normalization ops include:

        (1) None, do nothing.
        (2) `standard`, column-based L1 normalization. Normalize the data
        for each column:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{|x_{ij}|}}

        (3) `min-max`, column-based min-max normalization. Normalize the data
        for each column:

        .. math::
            norm_i = \frac{x_i - min(x[:])}{max(x[:]) - min(x[:])}

    Return
    ------
    numpy.array
        The features in numpy array

    Examples
    --------
    >>> inputs = [[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]]
    >>> feat = parse_numerical_feat(inputs, norm='standard')
    >>> feat
    array([[0.25, 0., 0.], [0.5, 0.33333333, 0.25], [0.25, 0.66666667, -0.75]])
    """
    feat = np.array(numerical_inputs, dtype='float')
    if norm == 'standard':
        return float_col_l1_normalize(feat)
    elif norm == 'min-max':
        return float_col_maxmin_normalize(feat)
    else:
        return feat
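
# For norm='min-max' each column is scaled independently onto [0, 1];
# with the docstring inputs the column minima are (1, 0, -3) and the
# maxima are (2, 2, 1), giving (values from the unit test below):
#
#   >>> parse_numerical_feat(inputs, norm='min-max')
#   array([[0., 0., 0.75], [1., 0.5, 1.], [0., 1., 0.]])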

def parse_numerical_multihot_feat(input_feats, low, high, bucket_cnt, window_size, norm=None):
    r""" Parse numerical features by matching them into different buckets.

    A bucket-range based algorithm is used to convert a numerical value
    into a multi-hot encoding feature.

    A numerical value range [low, high) is defined, and it is divided
    into bucket_cnt buckets. For an input value v, its effective range is
    [v - window_size/2, v + window_size/2], and we check which buckets of
    [low, high) that range covers.

    Parameters
    ----------
    input_feats : list of float
        Input numerical features
    low : float
        Lower bound of the range of the numerical values.
        All v_i < low will be set to v_i = low.
    high : float
        Upper bound of the range of the numerical values.
        All v_j > high will be set to v_j = high.
    bucket_cnt : int
        Number of buckets to use.
    window_size : float
        The sliding window used to convert a numerical value into bucket numbers.
    norm : str, optional
        Which kind of normalization is applied to the features.
        Supported normalization ops include:

        (1) None, do nothing.
        (2) `col`, column-based normalization. Normalize the data
        for each column:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}

        (3) `row`, row-based normalization. Normalize the data for
        each row:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{j=0}^N{x_{ij}}}

    Examples
    --------
    >>> inputs = [0., 15., 26., 40.]
    >>> low = 10.
    >>> high = 30.
    >>> bucket_cnt = 4
    >>> window_size = 10.  # buckets are 10~15, 15~20, 20~25, 25~30
    >>> feat = parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
    >>> feat
    array([[1., 0., 0., 0.],
           [1., 1., 1., 0.],
           [0., 0., 1., 1.],
           [0., 0., 0., 1.]])
    """
    raw_feats = np.array(input_feats, dtype=np.float32)
    num_nodes = raw_feats.shape[0]
    feat = np.zeros((num_nodes, bucket_cnt), dtype=np.float32)
    bucket_size = (high - low) / bucket_cnt
    epsilon = bucket_size / 10
    # clip each value's window [v - window_size/2, v + window_size/2]
    # into [low, high) before mapping it to bucket indices
    low_val = raw_feats - window_size / 2
    high_val = raw_feats + window_size / 2
    low_val[low_val < low] = low
    high_val[high_val < low] = low
    high_val[high_val >= high] = high - epsilon
    low_val[low_val >= high] = high - epsilon
    low_val -= low
    high_val -= low
    low_idx = (low_val / bucket_size).astype('int')
    high_idx = (high_val / bucket_size).astype('int') + 1
    for i in range(raw_feats.shape[0]):
        idx = np.arange(start=low_idx[i], stop=high_idx[i])
        feat[i][idx] = 1.

    if norm == 'col':
        return col_normalize(feat)
    if norm == 'row':
        return row_normalize(feat)
    else:
        return feat
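
# Worked example of the bucketing arithmetic above, using the docstring
# values (low=10, high=30, bucket_cnt=4, window_size=10): bucket_size is
# (30 - 10) / 4 = 5 and epsilon is 0.5. For v = 15. the window is
# [10., 20.], which shifts to [0., 10.] after subtracting low, giving
# low_idx = 0 and high_idx = 10/5 + 1 = 3, i.e. buckets {0, 1, 2} and
# the row [1., 1., 1., 0.].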

import dgl.data as data
import unittest, pytest
import numpy as np

def test_minigc():
    ds = data.MiniGCDataset(16, 10, 20)
@@ -20,331 +18,6 @@ def test_data_hash():
    assert a.hash == b.hash
    assert a.hash != c.hash

def test_row_normalize():
    features = np.array([[1., 1., 1.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([1./3., 1./3., 1./3.]), row_norm_feat)

    features = np.array([[1.], [1.], [1.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([[1.], [1.], [1.]]), row_norm_feat)

    features = np.array([[1., 0., 0.], [0., 1., 1.], [0., 0., 0.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([[1., 0., 0.], [0., 0.5, 0.5], [0., 0., 0.]]),
                       row_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [2., 1., 1.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([[1., 0., 0.], [0.5, 0.25, 0.25]]),
                       row_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [1., 1.], [0., 0.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([[1., 0.], [0.5, 0.5], [0., 0.]]),
                       row_norm_feat)

def test_col_normalize():
    features = np.array([[1., 1., 1.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[1., 1., 1.]]), col_norm_feat)

    features = np.array([[1.], [1.], [1.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[1./3.], [1./3.], [1./3.]]), col_norm_feat)

    features = np.array([[1., 0., 0.], [1., 1., 0.], [0., 0., 0.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[0.5, 0., 0.], [0.5, 1.0, 0.], [0., 0., 0.]]),
                       col_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [1., 1., 0.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[0.5, 0., 0.], [0.5, 1.0, 0.]]),
                       col_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [1., 1.], [2., 0.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[0.25, 0.], [0.25, 1.0], [0.5, 0.]]),
                       col_norm_feat)

def test_float_row_normalize():
    features = np.array([[1.], [2.], [-3.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1.], [1.], [-1.]]), row_norm_feat)

    features = np.array([[1., 2., -3.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1./6., 2./6., -3./6.]]), row_norm_feat)

    features = np.array([[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1., 0., 0.], [0.5, 0.25, 0.25], [1./6., 2./6., -3./6.]]),
                       row_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [-2., 1., 1.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1., 0., 0.], [-0.5, 0.25, 0.25]]),
                       row_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [-2., 1.], [1., 2.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1., 0.], [-2./3., 1./3.], [1./3., 2./3.]]),
                       row_norm_feat)

def test_float_col_normalize():
    features = np.array([[1., 2., -3.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[1., 1., -1.]]), col_norm_feat)

    features = np.array([[1.], [2.], [-3.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[1./6.], [2./6.], [-3./6.]]), col_norm_feat)

    features = np.array([[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[0.25, 0., 0.], [0.5, 1./3., 0.25], [0.25, 2./3., -0.75]]),
                       col_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [2., 1., -1.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[1./3., 0., 0.], [2./3., 1.0, -1.]]),
                       col_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [2., 1.], [1., -2.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[0.25, 0.], [0.5, 1./3.], [0.25, -2./3.]]),
                       col_norm_feat)

def test_float_col_maxmin_normalize():
    features = np.array([[1., 2., -3.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[0., 0., 0.]]), col_norm_feat)

    features = np.array([[1.], [2.], [-3.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[4./5.], [5./5.], [0.]]), col_norm_feat)

    features = np.array([[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[0., 0., 3./4.], [1., 0.5, 1.], [0., 1., 0.]]),
                       col_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [2., 1., -1.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[0., 0., 1.], [1., 1., 0.]]),
                       col_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [2., 1.], [4., -2.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[0., 2./3.], [1./3., 1.], [1., 0.]]),
                       col_norm_feat)
@unittest.skip("spacy language test is too heavy")
def test_embed_word2vec():
import spacy
inputs = ['hello', 'world']
languages = ['en_core_web_lg', 'fr_core_news_lg']
nlps = [spacy.load(languages[0])]
feats = data.utils.embed_word2vec(inputs[0], nlps)
doc = nlps[0](inputs[0])
assert np.allclose(doc.vector, feats)
nlps.append(spacy.load(languages[1]))
for input in inputs:
feats = data.utils.embed_word2vec(input, nlps)
doc0 = nlps[0](input)
doc1 = nlps[1](input)
assert np.allclose(np.concatenate((doc0.vector, doc1.vector)),
feats)
@unittest.skip("spacy language test is too heavy")
def test_parse_lang_feat():
import spacy
inputs = ['hello', 'world']
languages = ['en_core_web_lg', 'fr_core_news_lg']
nlps = [spacy.load(languages[0]), spacy.load(languages[1])]
feats = data.utils.parse_lang_feat(inputs, nlps)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
inputs = ["1", "2", "3", "4", "1", "2", "3", "4", "5", "6", "7", "8"]
feats = data.utils.parse_lang_feat(inputs, nlps)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
inputs = ["1", "2", "3", "4", "1", "2", "3", "4", "5", "6", "7", "8"]
feats = data.utils.parse_word2vec_feature(inputs, languages)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
@unittest.skip("LabelBinarizer and MultiLabelBinarizer is not included in CI env")
def test_parse_category_feat():
# single-hot
inputs = ['A', 'B']
feats = data.utils.parse_category_single_feat(inputs)
assert np.allclose(np.array([[1.,0.],[0.,1.]]), feats)
inputs = ['A', 'B', 'C', 'A']
feats = data.utils.parse_category_single_feat(inputs)
assert np.allclose(np.array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# col norm
feats = data.utils.parse_category_single_feat(inputs, norm='col')
assert np.allclose(np.array([[.5,0.,0.],[0.,1.,0.],[0.,0.,1.],[.5,0.,0.]]), feats)
# multi-hot
inputs = [['A'], ['B']]
feats = data.utils.parse_category_multi_feat(inputs)
assert np.allclose(np.array([[1.,0.],[0.,1.]]), feats)
inputs = [['A', 'B', 'C',], ['A', 'B'], ['C'], ['A']]
feats = data.utils.parse_category_multi_feat(inputs)
assert np.allclose(np.array([[1.,1.,1.],[1.,1.,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# row norm
feats = data.utils.parse_category_multi_feat(inputs, norm='row')
assert np.allclose(np.array([[1./3.,1./3.,1./3.],[.5,.5,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# col norm
feats = data.utils.parse_category_multi_feat(inputs, norm='col')
assert np.allclose(np.array([[1./3.,0.5,0.5],[1./3.,0.5,0.],[0.,0.,0.5],[1./3.,0.,0.]]), feats)

def test_parse_numerical_feat():
    inputs = [[1., 2., -3.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[1., 1., -1.]]), col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[0., 0., 0.]]), col_norm_feat)

    inputs = [[1.], [2.], [-3.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[1./6.], [2./6.], [-3./6.]]), col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[4./5.], [5./5.], [0.]]), col_norm_feat)

    inputs = [[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[0.25, 0., 0.], [0.5, 1./3., 0.25], [0.25, 2./3., -0.75]]),
                       col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[0., 0., 3./4.], [1., 0.5, 1.], [0., 1., 0.]]),
                       col_norm_feat)

    # input (2, 3)
    inputs = [[1., 0., 0.], [2., 1., -1.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[1./3., 0., 0.], [2./3., 1.0, -1.]]),
                       col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[0., 0., 1.], [1., 1., 0.]]),
                       col_norm_feat)

    # input (3, 2)
    inputs = [[1., 0.], [2., 1.], [1., -2.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[0.25, 0.], [0.5, 1./3.], [0.25, -2./3.]]),
                       col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[0., 2./3.], [1., 1.], [0., 0.]]),
                       col_norm_feat)

def test_parse_numerical_multihot_feat():
    inputs = [0., 15., 20., 10.1, 25., 40.]
    low = 10.
    high = 30.
    bucket_cnt = 2  # buckets: 10~20, 20~30
    window_size = 0.
    feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
    assert np.allclose(np.array([[1., 0.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [0., 1.]]), feat)

    inputs = [0., 5., 15., 20., 10.1, 25., 30.1, 40.]
    low = 10.
    high = 30.
    bucket_cnt = 4  # buckets: 10~15, 15~20, 20~25, 25~30
    window_size = 10.
    feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
    assert np.allclose(np.array([[1., 0., 0., 0.],
                                 [1., 0., 0., 0.],
                                 [1., 1., 1., 0.],
                                 [0., 1., 1., 1.],
                                 [1., 1., 0., 0.],
                                 [0., 0., 1., 1.],
                                 [0., 0., 0., 1.],
                                 [0., 0., 0., 1.]]), feat)

    # col norm
    feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size, norm='col')
    assert np.allclose(np.array([[1./4., 0., 0., 0.],
                                 [1./4., 0., 0., 0.],
                                 [1./4., 1./3., 1./3., 0.],
                                 [0., 1./3., 1./3., 1./4.],
                                 [1./4., 1./3., 0., 0.],
                                 [0., 0., 1./3., 1./4.],
                                 [0., 0., 0., 1./4.],
                                 [0., 0., 0., 1./4.]]), feat)

    # row norm
    feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size, norm='row')
    assert np.allclose(np.array([[1., 0., 0., 0.],
                                 [1., 0., 0., 0.],
                                 [1./3., 1./3., 1./3., 0.],
                                 [0., 1./3., 1./3., 1./3.],
                                 [1./2., 1./2., 0., 0.],
                                 [0., 0., 1./2., 1./2.],
                                 [0., 0., 0., 1.],
                                 [0., 0., 0., 1.]]), feat)

 if __name__ == '__main__':
-    #test_minigc()
-    #test_data_hash()
-    test_row_normalize()
-    test_col_normalize()
-    test_float_row_normalize()
-    test_float_col_normalize()
-    test_float_col_maxmin_normalize()
-    #test_embed_word2vec()
-    #test_parse_lang_feat()
-    #test_parse_category_feat()
-    test_parse_numerical_feat()
-    test_parse_numerical_multihot_feat()
+    test_minigc()
+    test_data_hash()
\ No newline at end of file