input_data.py 1.7 KB
Newer Older
1
"""
2
3
4
5
6
****************NOTE*****************
CREDITS : Thomas Kipf
since datasets are the same as those in kipf's implementation, 
Their preprocessing source was used as-is.
*************************************
7
"""
8
import pickle as pkl
9
10
import sys

11
import networkx as nx
12
import numpy as np
13
14
15
16
17
18
19
20
21
22
23
24
import scipy.sparse as sp


def parse_index_file(filename):
    """Read a text file holding one integer per line.

    Args:
        filename: Path of the index file to read.

    Returns:
        A list of the integers in file order. Whitespace-only lines are
        skipped, so an empty file yields an empty list.

    Raises:
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    # The context manager guarantees the handle is closed, even when a
    # malformed line makes int() raise part-way through the file.
    with open(filename) as index_file:
        return [int(line) for line in index_file if line.strip()]


def load_data(dataset):
    """Load the adjacency matrix and feature matrix of a dataset.

    Reads the pickled objects ``data/ind.<dataset>.x``, ``.tx``, ``.allx``
    and ``.graph`` plus the text file ``data/ind.<dataset>.test.index``.
    All paths are relative to the current working directory.

    Args:
        dataset: Dataset name substituted into the file paths. The value
            ``"citeseer"`` additionally triggers zero-row padding of ``tx``
            (see the comment in the body).

    Returns:
        A tuple ``(adj, features)``:

        * ``adj`` -- result of ``nx.adjacency_matrix`` on the graph built
          from the pickled dict of lists.
        * ``features`` -- ``allx`` stacked on top of ``tx``, converted to
          LIL format, with the test rows permuted as described below.

    Raises:
        IOError/OSError: If any of the dataset files cannot be opened.
        ValueError: If the test index file contains a non-integer line.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load
    # dataset files that come from a trusted source.
    names = ["x", "tx", "allx", "graph"]
    objects = []
    for name in names:
        with open("data/ind.{}.{}".format(dataset, name), "rb") as f:
            if sys.version_info > (3, 0):
                # NOTE(review): the latin1 encoding suggests the pickles were
                # written by Python 2 -- confirm against the data files.
                objects.append(pkl.load(f, encoding="latin1"))
            else:
                objects.append(pkl.load(f))
    x, tx, allx, graph = tuple(objects)

    # Test indices in file order, and the same indices sorted ascending.
    test_idx_reorder = parse_index_file(
        "data/ind.{}.test.index".format(dataset)
    )
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == "citeseer":
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(
            min(test_idx_reorder), max(test_idx_reorder) + 1
        )
        # One row per index in the contiguous range; rows whose index is not
        # listed in the test index file stay all-zero.
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended

    features = sp.vstack((allx, tx)).tolil()
    # Copy the rows found at the sorted test indices to the positions given
    # by the file order. The right-hand side is evaluated before assignment.
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    return adj, features