"...models/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "e2e511be5918fc61008f384d88bce06bd61070da"
Unverified Commit 95c0ff63 authored by Rhett Ying, committed by GitHub

[Feature] add CSVDataset to load data from csv files (#3547)

* [Feature] add CSVDataset to load data from csv files

* add CSVDataset class file

* install pyyaml when running unit tests

* install pandas for unit tests

* utilize pydantic for YAML config check

* generate yaml and csv files during test

* make more keys as optional

* remove/rename several keys in yaml config and add more tests, though it looks a bit clumsy

* fix test failure on mxnet

* pass /path/to/dataset instead of yaml path

* code refinement

* code refine

* change several yaml field such as feat and graph_id

* merge graph generation logic

* refine code

* Refactored_first_version

* DGLCSVDataset works for single heterograph

* add more tests

* fix test failure in mxnet

* add docstring

* use list comprehension for dict

* fix version in YAML

* refine data length assert

* use dict.pop directly

* remove ambiguous variable names

* refine graph id missing logic

* refine graph create call

* separate node/edge/graph data parser

* remove separator in DefaultDataParser

* refine validation error log for yaml field

* minor check

* refine code via dict.get()

* move load_from_csv into Node/Edge/GraphData

* move _parse_node/edge/graph_data into Node/Edge/GraphData

* refine id-related fields check

* check duplicate ntypes/etypes when load yaml

* refine docstring
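As a quick illustration (a minimal sketch only; the directory path below is a placeholder), a dataset directory containing a meta.yaml plus the CSV files it references can be loaded like this:

import dgl.data as data

# Load every graph described by /path/to/dataset_dir/meta.yaml and its CSV files.
dataset = data.DGLCSVDataset('/path/to/dataset_dir')
g = dataset[0]  # a DGLGraph (plus a label if graph-level labels are provided)
print(g.num_nodes(), g.num_edges())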
parent c04b5bc7
@@ -29,6 +29,7 @@ from .knowledge_graph import FB15k237Dataset, FB15kDataset, WN18Dataset
from .rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
from .fakenews import FakeNewsDataset
from .csv_dataset import DGLCSVDataset
def register_data_args(parser):
......
import os
import yaml
from yaml.loader import SafeLoader
import pandas as pd
import numpy as np
from typing import List, Optional
import pydantic as dt
from .dgl_dataset import DGLDataset
from ..convert import heterograph as dgl_heterograph
from .. import backend as F
from .utils import save_graphs, load_graphs
from ..base import dgl_warning, DGLError
import abc
import ast
from typing import Callable
class MetaNode(dt.BaseModel):
""" Class of node_data in YAML. Internal use only. """
file_name: str
ntype: Optional[str] = '_V'
graph_id_field: Optional[str] = 'graph_id'
node_id_field: Optional[str] = 'node_id'
class MetaEdge(dt.BaseModel):
""" Class of edge_data in YAML. Internal use only. """
file_name: str
etype: Optional[List[str]] = ['_V', '_E', '_V']
graph_id_field: Optional[str] = 'graph_id'
src_id_field: Optional[str] = 'src_id'
dst_id_field: Optional[str] = 'dst_id'
class MetaGraph(dt.BaseModel):
""" Class of graph_data in YAML. Internal use only. """
file_name: str
graph_id_field: Optional[str] = 'graph_id'
class MetaYaml(dt.BaseModel):
""" Class of YAML. Internal use only. """
version: Optional[str] = '1.0.0'
dataset_name: str
separator: Optional[str] = ','
node_data: List[MetaNode]
edge_data: List[MetaEdge]
graph_data: Optional[MetaGraph] = None
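# For reference, a minimal meta.yaml accepted by the schema above could look like
# the following (all file names and type names are illustrative only):
#
#   dataset_name: example
#   node_data:
#     - file_name: nodes.csv
#       ntype: user
#   edge_data:
#     - file_name: edges.csv
#       etype: [user, follow, user]
#   graph_data:
#     file_name: graph.csv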
def load_yaml_with_sanity_check(yaml_file):
""" Load yaml and do sanity check. Internal use only. """
with open(yaml_file) as f:
yaml_data = yaml.load(f, Loader=SafeLoader)
try:
meta_yaml = MetaYaml(**yaml_data)
except dt.ValidationError as e:
print(
"Details of pydantic.ValidationError:\n{}".format(e.json()))
raise DGLError(
"Validation Error for YAML fields. Details are shown above.")
if meta_yaml.version != '1.0.0':
raise DGLError("Invalid CSVDataset version {}. Supported versions: '1.0.0'".format(
meta_yaml.version))
ntypes = [meta.ntype for meta in meta_yaml.node_data]
if len(ntypes) > len(set(ntypes)):
raise DGLError(
"Each node CSV file must have a unique node type name, but found duplicate node type: {}.".format(ntypes))
etypes = [tuple(meta.etype) for meta in meta_yaml.edge_data]
if len(etypes) > len(set(etypes)):
raise DGLError(
"Each edge CSV file must have a unique edge type name, but found duplicate edge type: {}.".format(etypes))
return meta_yaml
def _validate_data_length(data_dict):
len_dict = {k: len(v) for k, v in data_dict.items()}
lst = list(len_dict.values())
res = lst.count(lst[0]) == len(lst)
if not res:
raise DGLError(
"All data are required to have same length while some of them does not. Length of data={}".format(str(len_dict)))
class BaseData:
""" Class of base data which is inherited by Node/Edge/GraphData. Internal use only. """
@staticmethod
def read_csv(file_name, base_dir, separator):
csv_path = file_name
if base_dir is not None:
csv_path = os.path.join(base_dir, csv_path)
return pd.read_csv(csv_path, sep=separator)
@staticmethod
def pop_from_dataframe(df: pd.DataFrame, item: str):
ret = None
try:
ret = df.pop(item).to_numpy().squeeze()
except KeyError:
pass
return ret
class NodeData(BaseData):
""" Class of node data which is used for DGLGraph construction. Internal use only. """
def __init__(self, node_id, data, type=None, graph_id=None):
self.id = np.array(node_id, dtype=np.int64)
self.data = data
self.type = type if type is not None else '_V'
self.graph_id = np.array(graph_id, dtype=np.int64) if graph_id is not None else np.full(
len(node_id), 0)
_validate_data_length({**{'id': self.id, 'graph_id': self.graph_id}, **self.data})
@staticmethod
def load_from_csv(meta: MetaNode, data_parser: Callable, base_dir=None, separator=','):
df = BaseData.read_csv(meta.file_name, base_dir, separator)
node_ids = BaseData.pop_from_dataframe(df, meta.node_id_field)
graph_ids = BaseData.pop_from_dataframe(df, meta.graph_id_field)
if node_ids is None:
raise DGLError("Missing node id field [{}] in file [{}].".format(
meta.node_id_field, meta.file_name))
ntype = meta.ntype
ndata = data_parser(df)
return NodeData(node_ids, ndata, type=ntype, graph_id=graph_ids)
@staticmethod
def to_dict(node_data: List['NodeData']) -> dict:
# node_ids could be arbitrary numeric values, namely non-sorted, duplicated, not labeled from 0 to num_nodes-1
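# e.g. for one graph, raw ids [10, 3, 10, 7] are deduplicated and remapped to
# consecutive ids via {3: 0, 7: 1, 10: 2} (np.unique sorts the unique ids);
# the duplicated id 10 keeps only its first occurrence and a warning is issued.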
node_dict = {}
for n_data in node_data:
graph_ids = np.unique(n_data.graph_id)
for graph_id in graph_ids:
idx = n_data.graph_id == graph_id
ids = n_data.id[idx]
u_ids, u_indices = np.unique(ids, return_index=True)
if len(ids) > len(u_ids):
dgl_warning(
"There exist duplicated ids and only the first ones are kept.")
if graph_id not in node_dict:
node_dict[graph_id] = {}
node_dict[graph_id][n_data.type] = {'mapping': {index: i for i,
index in enumerate(ids[u_indices])},
'data': {k: F.tensor(v[idx][u_indices])
for k, v in n_data.data.items()}}
return node_dict
class EdgeData(BaseData):
""" Class of edge data which is used for DGLGraph construction. Internal use only. """
def __init__(self, src_id, dst_id, data, type=None, graph_id=None):
self.src = np.array(src_id, dtype=np.int64)
self.dst = np.array(dst_id, dtype=np.int64)
self.data = data
self.type = type if type is not None else ('_V', '_E', '_V')
self.graph_id = np.array(graph_id, dtype=np.int64) if graph_id is not None else np.full(
len(src_id), 0)
_validate_data_length({**{'src': self.src, 'dst': self.dst, 'graph_id': self.graph_id}, **self.data})
@staticmethod
def load_from_csv(meta: MetaEdge, data_parser: Callable, base_dir=None, separator=','):
df = BaseData.read_csv(meta.file_name, base_dir, separator)
src_ids = BaseData.pop_from_dataframe(df, meta.src_id_field)
if src_ids is None:
raise DGLError("Missing src id field [{}] in file [{}].".format(
meta.src_id_field, meta.file_name))
dst_ids = BaseData.pop_from_dataframe(df, meta.dst_id_field)
if dst_ids is None:
raise DGLError("Missing dst id field [{}] in file [{}].".format(
meta.dst_id_field, meta.file_name))
graph_ids = BaseData.pop_from_dataframe(df, meta.graph_id_field)
etype = tuple(meta.etype)
edata = data_parser(df)
return EdgeData(src_ids, dst_ids, edata, type=etype, graph_id=graph_ids)
@staticmethod
def to_dict(edge_data: List['EdgeData'], node_dict: dict) -> dict:
edge_dict = {}
for e_data in edge_data:
(src_type, e_type, dst_type) = e_data.type
graph_ids = np.unique(e_data.graph_id)
for graph_id in graph_ids:
if graph_id in edge_dict and e_data.type in edge_dict[graph_id]:
raise DGLError(f"Duplicate edge type[{e_data.type}] for same graph[{graph_id}], please place the same edge_type for same graph into single EdgeData.")
idx = e_data.graph_id == graph_id
src_mapping = node_dict[graph_id][src_type]['mapping']
dst_mapping = node_dict[graph_id][dst_type]['mapping']
src_ids = [src_mapping[index] for index in e_data.src[idx]]
dst_ids = [dst_mapping[index] for index in e_data.dst[idx]]
if graph_id not in edge_dict:
edge_dict[graph_id] = {}
edge_dict[graph_id][e_data.type] = {'edges': (F.tensor(src_ids), F.tensor(dst_ids)),
'data': {k: F.tensor(v[idx])
for k, v in e_data.data.items()}}
return edge_dict
class GraphData(BaseData):
""" Class of graph data which is used for DGLGraph construction. Internal use only. """
def __init__(self, graph_id, data):
self.graph_id = np.array(graph_id, dtype=np.int64)
self.data = data
_validate_data_length({**{'graph_id': self.graph_id}, **self.data})
@staticmethod
def load_from_csv(meta: MetaGraph, data_parser: Callable, base_dir=None, separator=','):
df = BaseData.read_csv(meta.file_name, base_dir, separator)
graph_ids = BaseData.pop_from_dataframe(df, meta.graph_id_field)
if graph_ids is None:
raise DGLError("Missing graph id field [{}] in file [{}].".format(
meta.graph_id_field, meta.file_name))
gdata = data_parser(df)
return GraphData(graph_ids, gdata)
@staticmethod
def to_dict(graph_data: 'GraphData', graphs_dict: dict) -> dict:
missing_ids = np.setdiff1d(
np.array(list(graphs_dict.keys())), graph_data.graph_id)
if len(missing_ids) > 0:
raise DGLError(
"Found following graph ids in node/edge CSVs but not in graph CSV: {}.".format(missing_ids))
graph_ids = graph_data.graph_id
graphs = []
for graph_id in graph_ids:
if graph_id not in graphs_dict:
graphs_dict[graph_id] = dgl_heterograph(
{('_V', '_E', '_V'): ([], [])})
for graph_id in graph_ids:
graphs.append(graphs_dict[graph_id])
data = {k: F.tensor(v) for k, v in graph_data.data.items()}
return graphs, data
class DGLGraphConstructor:
""" Class for constructing DGLGraph from Node/Edge/Graph data. Internal use only. """
@staticmethod
def construct_graphs(node_data, edge_data, graph_data=None):
if not isinstance(node_data, list):
node_data = [node_data]
if not isinstance(edge_data, list):
edge_data = [edge_data]
node_dict = NodeData.to_dict(node_data)
edge_dict = EdgeData.to_dict(edge_data, node_dict)
graph_dict = DGLGraphConstructor._construct_graphs(
node_dict, edge_dict)
if graph_data is None:
graph_data = GraphData(np.full(1, 0), {})
graphs, data = GraphData.to_dict(
graph_data, graph_dict)
return graphs, data
@staticmethod
def _construct_graphs(node_dict, edge_dict):
graph_dict = {}
for graph_id in node_dict:
if graph_id not in edge_dict:
edge_dict[graph_id] = {('_V', '_E', '_V'): {'edges': ([], [])}}
graph = dgl_heterograph({etype: edata['edges']
for etype, edata in edge_dict[graph_id].items()},
num_nodes_dict={ntype: len(ndata['mapping'])
for ntype, ndata in node_dict[graph_id].items()})
def assign_data(type, src_data, dst_data):
for key, value in src_data.items():
dst_data[type].data[key] = value
for type, data in node_dict[graph_id].items():
assign_data(type, data['data'], graph.nodes)
for type, data in edge_dict[graph_id].items():
assign_data(type, data['data'], graph.edges)
graph_dict[graph_id] = graph
return graph_dict
class DefaultDataParser:
""" Default data parser for DGLCSVDataset. It
1. ignores any column which does not have a header.
2. tries to convert a cell into a list of numeric values (as generated by
np.array().tolist()) if the cell data is a str separated by ','.
3. otherwise, reads the data and infers the data type directly.
"""
def __call__(self, df: pd.DataFrame):
data = {}
for header in df:
if 'Unnamed' in header:
dgl_warning("Unamed column is found. Ignored...")
continue
dt = df[header].to_numpy().squeeze()
if len(dt) > 0 and isinstance(dt[0], str):
# probably consists of a list of numeric values
dt = np.array([ast.literal_eval(row) for row in dt])
data[header] = dt
return data
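# Example (illustrative): for a CSV column whose cells are strings such as
#   feat
#   "[0.1, 0.2, 0.3]"
#   "[0.4, 0.5, 0.6]"
# each cell is parsed with ast.literal_eval, so the returned 'feat' entry is a
# numpy array of shape (num_rows, 3).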
class DGLCSVDataset(DGLDataset):
""" This class aims to parse data from CSV files, construct DGLGraph
and behaves as a DGLDataset.
Parameters
----------
data_path : str
Directory which contains 'meta.yaml' and CSV files
force_reload : bool, optional
Whether to reload the dataset. Default: False
verbose: bool, optional
Whether to print out progress information. Default: True.
node_data_parser : dict[str, callable], optional
A dictionary used for node data parsing when loading from CSV files.
The key is the node type and the value is a callable object which is
used to parse the corresponding column data. Default: None. If None,
a default data parser is applied which loads data directly and tries
to convert lists into arrays.
edge_data_parser : dict[(str, str, str), callable], optional
A dictionary used for edge data parsing when loading from CSV files.
The key is the edge type and the value is a callable object which is
used to parse the corresponding column data. Default: None. If None,
a default data parser is applied which loads data directly and tries
to convert lists into arrays.
graph_data_parser : callable, optional
A callable object which is used to parse the graph-level column
data. Default: None. If None, a default data parser is applied
which loads data directly and tries to convert lists into arrays.
Attributes
----------
graphs : list of :class:`dgl.DGLGraph`
Graphs of the dataset.
data : dict
Any available graph-level data such as graph-level features or labels.
Examples
[TODO]: link to a detailed web page.
"""
META_YAML_NAME = 'meta.yaml'
def __init__(self, data_path, force_reload=False, verbose=True, node_data_parser=None, edge_data_parser=None, graph_data_parser=None):
self.graphs = None
self.data = None
self.node_data_parser = {} if node_data_parser is None else node_data_parser
self.edge_data_parser = {} if edge_data_parser is None else edge_data_parser
self.graph_data_parser = graph_data_parser
self.default_data_parser = DefaultDataParser()
meta_yaml_path = os.path.join(data_path, DGLCSVDataset.META_YAML_NAME)
if not os.path.exists(meta_yaml_path):
raise DGLError(
"'{}' cannot be found under {}.".format(DGLCSVDataset.META_YAML_NAME, data_path))
self.meta_yaml = load_yaml_with_sanity_check(meta_yaml_path)
ds_name = self.meta_yaml.dataset_name
super().__init__(ds_name, raw_dir=os.path.dirname(
meta_yaml_path), force_reload=force_reload, verbose=verbose)
def process(self):
"""Parse node/edge data from CSV files and construct DGL.Graphs
"""
meta_yaml = self.meta_yaml
base_dir = self.raw_dir
node_data = []
for meta_node in meta_yaml.node_data:
if meta_node is None:
continue
ntype = meta_node.ntype
data_parser = self.node_data_parser.get(
ntype, self.default_data_parser)
ndata = NodeData.load_from_csv(
meta_node, base_dir=base_dir, separator=meta_yaml.separator, data_parser=data_parser)
node_data.append(ndata)
edge_data = []
for meta_edge in meta_yaml.edge_data:
if meta_edge is None:
continue
etype = tuple(meta_edge.etype)
data_parser = self.edge_data_parser.get(
etype, self.default_data_parser)
edata = EdgeData.load_from_csv(
meta_edge, base_dir=base_dir, separator=meta_yaml.separator, data_parser=data_parser)
edge_data.append(edata)
graph_data = None
if meta_yaml.graph_data is not None:
meta_graph = meta_yaml.graph_data
data_parser = self.default_data_parser if self.graph_data_parser is None else self.graph_data_parser
graph_data = GraphData.load_from_csv(
meta_graph, base_dir=base_dir, separator=meta_yaml.separator, data_parser=data_parser)
# construct graphs
self.graphs, self.data = DGLGraphConstructor.construct_graphs(
node_data, edge_data, graph_data)
def has_cache(self):
graph_path = os.path.join(self.save_path,
self.name + '.bin')
if os.path.exists(graph_path):
return True
return False
def save(self):
if self.graphs is None:
raise DGLError("No graphs available in dataset")
graph_path = os.path.join(self.save_path,
self.name + '.bin')
save_graphs(graph_path, self.graphs,
labels=self.data)
def load(self):
graph_path = os.path.join(self.save_path,
self.name + '.bin')
self.graphs, self.data = load_graphs(graph_path)
def __getitem__(self, i):
if 'label' in self.data:
return self.graphs[i], self.data['label'][i]
else:
return self.graphs[i]
def __len__(self):
return len(self.graphs)
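For completeness, here is a hedged sketch of how a custom data parser could be plugged in (mirroring the CustDataParser used in the tests below; the dataset directory is a placeholder):

import dgl.data as data

class ShiftLabelParser:
    """Toy parser: keeps every column and shifts the 'label' column by one."""
    def __call__(self, df):
        parsed = {}
        for header in df:
            col = df[header].to_numpy().squeeze()
            if header == 'label':
                col = col + 1
            parsed[header] = col
        return parsed

# 'user' node CSVs and ('user', 'like', 'item') edge CSVs use the custom parser;
# all other CSVs fall back to DefaultDataParser.
dataset = data.DGLCSVDataset(
    '/path/to/dataset_dir',
    node_data_parser={'user': ShiftLabelParser()},
    edge_data_parser={('user', 'like', 'item'): ShiftLabelParser()})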
import unittest
import backend as F
import numpy as np
import gzip
import tempfile
import os
import pandas as pd
import yaml
import pytest
import dgl.data as data
import dgl.data.csv_dataset as csv_ds
from dgl import DGLError
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
@@ -50,7 +55,7 @@ def test_fakenews():
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_tudataset_regression():
ds = data.TUDataset('ZINC_test', force_reload=True)
assert len(ds) == 5000
@@ -59,13 +64,15 @@ def test_tudataset_regression():
def test_data_hash():
class HashTestDataset(data.DGLDataset):
def __init__(self, hash_key=()):
super(HashTestDataset, self).__init__(
'hashtest', hash_key=hash_key)
def _load(self):
pass
a = HashTestDataset((True, 0, '1', (1, 2, 3)))
b = HashTestDataset((True, 0, '1', (1, 2, 3)))
c = HashTestDataset((True, 0, '1', (1, 2, 4)))
assert a.hash == b.hash
assert a.hash != c.hash
@@ -156,6 +163,849 @ def test_extract_archive():
assert os.path.exists(os.path.join(dst_dir, gz_file))
def _test_construct_graphs_homo():
# node_ids could be non-sorted, duplicated, not labeled from 0 to num_nodes-1
num_nodes = 100
num_edges = 1000
num_dims = 3
num_dup_nodes = int(num_nodes*0.2)
node_ids = np.random.choice(
np.arange(num_nodes*2), size=num_nodes, replace=False)
assert len(node_ids) == num_nodes
np.random.shuffle(node_ids)
node_ids = np.hstack((node_ids, node_ids[:num_dup_nodes]))
t_ndata = {'feat': np.random.rand(num_nodes+num_dup_nodes, num_dims),
'label': np.random.randint(2, size=num_nodes+num_dup_nodes)}
_, u_indices = np.unique(node_ids, return_index=True)
ndata = {'feat': t_ndata['feat'][u_indices],
'label': t_ndata['label'][u_indices]}
node_data = csv_ds.NodeData(node_ids, t_ndata)
src_ids = np.random.choice(node_ids, size=num_edges)
dst_ids = np.random.choice(node_ids, size=num_edges)
edata = {'feat': np.random.rand(
num_edges, num_dims), 'label': np.random.randint(2, size=num_edges)}
edge_data = csv_ds.EdgeData(src_ids, dst_ids, edata)
graphs, data_dict = csv_ds.DGLGraphConstructor.construct_graphs(
node_data, edge_data)
assert len(graphs) == 1
assert len(data_dict) == 0
g = graphs[0]
assert g.is_homogeneous
assert g.num_nodes() == num_nodes
assert g.num_edges() == num_edges
def assert_data(lhs, rhs):
for key, value in lhs.items():
assert key in rhs
assert F.array_equal(F.tensor(value), rhs[key])
assert_data(ndata, g.ndata)
assert_data(edata, g.edata)
def _test_construct_graphs_hetero():
# node_ids could be non-sorted, duplicated, not labeled from 0 to num_nodes-1
num_nodes = 100
num_edges = 1000
num_dims = 3
num_dup_nodes = int(num_nodes*0.2)
ntypes = ['user', 'item']
node_data = []
node_ids_dict = {}
ndata_dict = {}
for ntype in ntypes:
node_ids = np.random.choice(
np.arange(num_nodes*2), size=num_nodes, replace=False)
assert len(node_ids) == num_nodes
np.random.shuffle(node_ids)
node_ids = np.hstack((node_ids, node_ids[:num_dup_nodes]))
t_ndata = {'feat': np.random.rand(num_nodes+num_dup_nodes, num_dims),
'label': np.random.randint(2, size=num_nodes+num_dup_nodes)}
_, u_indices = np.unique(node_ids, return_index=True)
ndata = {'feat': t_ndata['feat'][u_indices],
'label': t_ndata['label'][u_indices]}
node_data.append(csv_ds.NodeData(node_ids, t_ndata, type=ntype))
node_ids_dict[ntype] = node_ids
ndata_dict[ntype] = ndata
etypes = [('user', 'follow', 'user'), ('user', 'like', 'item')]
edge_data = []
edata_dict = {}
for src_type, e_type, dst_type in etypes:
src_ids = np.random.choice(node_ids_dict[src_type], size=num_edges)
dst_ids = np.random.choice(node_ids_dict[dst_type], size=num_edges)
edata = {'feat': np.random.rand(
num_edges, num_dims), 'label': np.random.randint(2, size=num_edges)}
edge_data.append(csv_ds.EdgeData(src_ids, dst_ids, edata,
type=(src_type, e_type, dst_type)))
edata_dict[(src_type, e_type, dst_type)] = edata
graphs, data_dict = csv_ds.DGLGraphConstructor.construct_graphs(
node_data, edge_data)
assert len(graphs) == 1
assert len(data_dict) == 0
g = graphs[0]
assert not g.is_homogeneous
assert g.num_nodes() == num_nodes*len(ntypes)
assert g.num_edges() == num_edges*len(etypes)
def assert_data(lhs, rhs):
for key, value in lhs.items():
assert key in rhs
assert F.array_equal(F.tensor(value), rhs[key])
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
assert_data(ndata_dict[ntype], g.nodes[ntype].data)
for etype in g.canonical_etypes:
assert g.num_edges(etype) == num_edges
assert_data(edata_dict[etype], g.edges[etype].data)
def _test_construct_graphs_multiple():
num_nodes = 100
num_edges = 1000
num_graphs = 10
num_dims = 3
node_ids = np.array([], dtype=np.int64)
src_ids = np.array([], dtype=np.int64)
dst_ids = np.array([], dtype=np.int64)
ngraph_ids = np.array([], dtype=np.int64)
egraph_ids = np.array([], dtype=np.int64)
u_indices = np.array([], dtype=np.int64)
for i in range(num_graphs):
l_node_ids = np.random.choice(
np.arange(num_nodes*2), size=num_nodes, replace=False)
node_ids = np.append(node_ids, l_node_ids)
_, l_u_indices = np.unique(l_node_ids, return_index=True)
u_indices = np.append(u_indices, l_u_indices)
ngraph_ids = np.append(ngraph_ids, np.full(num_nodes, i))
src_ids = np.append(src_ids, np.random.choice(
l_node_ids, size=num_edges))
dst_ids = np.append(dst_ids, np.random.choice(
l_node_ids, size=num_edges))
egraph_ids = np.append(egraph_ids, np.full(num_edges, i))
ndata = {'feat': np.random.rand(num_nodes*num_graphs, num_dims),
'label': np.random.randint(2, size=num_nodes*num_graphs)}
node_data = csv_ds.NodeData(node_ids, ndata, graph_id=ngraph_ids)
edata = {'feat': np.random.rand(
num_edges*num_graphs, num_dims), 'label': np.random.randint(2, size=num_edges*num_graphs)}
edge_data = csv_ds.EdgeData(src_ids, dst_ids, edata, graph_id=egraph_ids)
gdata = {'feat': np.random.rand(num_graphs, num_dims),
'label': np.random.randint(2, size=num_graphs)}
graph_data = csv_ds.GraphData(np.arange(num_graphs), gdata)
graphs, data_dict = csv_ds.DGLGraphConstructor.construct_graphs(
node_data, edge_data, graph_data)
assert len(graphs) == num_graphs
assert len(data_dict) == len(gdata)
for k, v in data_dict.items():
assert F.array_equal(F.tensor(gdata[k]), v)
for i, g in enumerate(graphs):
assert g.is_homogeneous
assert g.num_nodes() == num_nodes
assert g.num_edges() == num_edges
def assert_data(lhs, rhs, size, node=False):
for key, value in lhs.items():
assert key in rhs
value = value[i*size:(i+1)*size]
if node:
indices = u_indices[i*size:(i+1)*size]
value = value[indices]
assert F.array_equal(F.tensor(value), rhs[key])
assert_data(ndata, g.ndata, num_nodes, node=True)
assert_data(edata, g.edata, num_edges)
# Graph IDs found in node/edge CSV but not in graph CSV
graph_data = csv_ds.GraphData(np.arange(num_graphs-2), {})
expect_except = False
try:
_, _ = csv_ds.DGLGraphConstructor.construct_graphs(
node_data, edge_data, graph_data)
except:
expect_except = True
assert expect_except
def _test_DefaultDataParser():
# common csv
with tempfile.TemporaryDirectory() as test_dir:
csv_path = os.path.join(test_dir, "nodes.csv")
num_nodes = 5
num_labels = 3
num_dims = 2
node_id = np.arange(num_nodes)
label = np.random.randint(num_labels, size=num_nodes)
feat = np.random.rand(num_nodes, num_dims)
df = pd.DataFrame({'node_id': node_id, 'label': label,
'feat': [line.tolist() for line in feat],
})
df.to_csv(csv_path, index=False)
dp = csv_ds.DefaultDataParser()
df = pd.read_csv(csv_path)
dt = dp(df)
assert np.array_equal(node_id, dt['node_id'])
assert np.array_equal(label, dt['label'])
assert np.array_equal(feat, dt['feat'])
# string consists of non-numeric values
with tempfile.TemporaryDirectory() as test_dir:
csv_path = os.path.join(test_dir, "nodes.csv")
df = pd.DataFrame({'label': ['a', 'b', 'c'],
})
df.to_csv(csv_path, index=False)
dp = csv_ds.DefaultDataParser()
df = pd.read_csv(csv_path)
expect_except = False
try:
dt = dp(df)
except:
expect_except = True
assert expect_except
# csv has index column which is ignored as it's unnamed
with tempfile.TemporaryDirectory() as test_dir:
csv_path = os.path.join(test_dir, "nodes.csv")
df = pd.DataFrame({'label': [1, 2, 3],
})
df.to_csv(csv_path)
dp = csv_ds.DefaultDataParser()
df = pd.read_csv(csv_path)
dt = dp(df)
assert len(dt) == 1
def _test_load_yaml_with_sanity_check():
with tempfile.TemporaryDirectory() as test_dir:
yaml_path = os.path.join(test_dir, 'meta.yaml')
# workable but meaningless usually
yaml_data = {'dataset_name': 'default',
'node_data': [], 'edge_data': []}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
assert meta.version == '1.0.0'
assert meta.dataset_name == 'default'
assert meta.separator == ','
assert len(meta.node_data) == 0
assert len(meta.edge_data) == 0
assert meta.graph_data is None
# minimum with required fields only
yaml_data = {'version': '1.0.0', 'dataset_name': 'default', 'node_data': [{'file_name': 'nodes.csv'}],
'edge_data': [{'file_name': 'edges.csv'}],
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
for ndata in meta.node_data:
assert ndata.file_name == 'nodes.csv'
assert ndata.ntype == '_V'
assert ndata.graph_id_field == 'graph_id'
assert ndata.node_id_field == 'node_id'
for edata in meta.edge_data:
assert edata.file_name == 'edges.csv'
assert edata.etype == ['_V', '_E', '_V']
assert edata.graph_id_field == 'graph_id'
assert edata.src_id_field == 'src_id'
assert edata.dst_id_field == 'dst_id'
# optional fields are specified
yaml_data = {'version': '1.0.0', 'dataset_name': 'default',
'separator': '|',
'node_data': [{'file_name': 'nodes.csv', 'ntype': 'user', 'graph_id_field': 'xxx', 'node_id_field': 'xxx'}],
'edge_data': [{'file_name': 'edges.csv', 'etype': ['user', 'follow', 'user'], 'graph_id_field':'xxx', 'src_id_field':'xxx', 'dst_id_field':'xxx'}],
'graph_data': {'file_name': 'graph.csv', 'graph_id_field': 'xxx'}
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
assert len(meta.node_data) == 1
ndata = meta.node_data[0]
assert ndata.ntype == 'user'
assert ndata.graph_id_field == 'xxx'
assert ndata.node_id_field == 'xxx'
assert len(meta.edge_data) == 1
edata = meta.edge_data[0]
assert edata.etype == ['user', 'follow', 'user']
assert edata.graph_id_field == 'xxx'
assert edata.src_id_field == 'xxx'
assert edata.dst_id_field == 'xxx'
assert meta.graph_data is not None
assert meta.graph_data.file_name == 'graph.csv'
assert meta.graph_data.graph_id_field == 'xxx'
# some required fields are missing
yaml_data = {'dataset_name': 'default',
'node_data': [], 'edge_data': []}
for field in yaml_data.keys():
ydata = {k: v for k, v in yaml_data.items()}
ydata.pop(field)
with open(yaml_path, 'w') as f:
yaml.dump(ydata, f, sort_keys=False)
expect_except = False
try:
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
except:
expect_except = True
assert expect_except
# inapplicable version
yaml_data = {'version': '0.0.0', 'dataset_name': 'default', 'node_data': [{'file_name': 'nodes_0.csv'}],
'edge_data': [{'file_name': 'edges_0.csv'}],
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
expect_except = False
try:
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
except DGLError:
expect_except = True
assert expect_except
# duplicate node types
yaml_data = {'version': '1.0.0', 'dataset_name': 'default', 'node_data': [{'file_name': 'nodes.csv'}, {'file_name': 'nodes.csv'}],
'edge_data': [{'file_name': 'edges.csv'}],
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
expect_except = False
try:
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
except DGLError:
expect_except = True
assert expect_except
# duplicate edge types
yaml_data = {'version': '1.0.0', 'dataset_name': 'default', 'node_data': [{'file_name': 'nodes.csv'}],
'edge_data': [{'file_name': 'edges.csv'}, {'file_name': 'edges.csv'}],
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
expect_except = False
try:
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
except DGLError:
expect_except = True
assert expect_except
def _test_load_node_data_from_csv():
with tempfile.TemporaryDirectory() as test_dir:
num_nodes = 100
# minimum
df = pd.DataFrame({'node_id': np.arange(num_nodes)})
csv_path = os.path.join(test_dir, 'nodes.csv')
df.to_csv(csv_path, index=False)
meta_node = csv_ds.MetaNode(file_name=csv_path)
node_data = csv_ds.NodeData.load_from_csv(
meta_node, csv_ds.DefaultDataParser())
assert np.array_equal(df['node_id'], node_data.id)
assert len(node_data.data) == 0
# common case
df = pd.DataFrame({'node_id': np.arange(num_nodes),
'label': np.random.randint(3, size=num_nodes)})
csv_path = os.path.join(test_dir, 'nodes.csv')
df.to_csv(csv_path, index=False)
meta_node = csv_ds.MetaNode(file_name=csv_path)
node_data = csv_ds.NodeData.load_from_csv(
meta_node, csv_ds.DefaultDataParser())
assert np.array_equal(df['node_id'], node_data.id)
assert len(node_data.data) == 1
assert np.array_equal(df['label'], node_data.data['label'])
assert np.array_equal(np.full(num_nodes, 0), node_data.graph_id)
assert node_data.type == '_V'
# add more fields into nodes.csv
df = pd.DataFrame({'node_id': np.arange(num_nodes), 'label': np.random.randint(
3, size=num_nodes), 'graph_id': np.full(num_nodes, 1)})
csv_path = os.path.join(test_dir, 'nodes.csv')
df.to_csv(csv_path, index=False)
meta_node = csv_ds.MetaNode(file_name=csv_path)
node_data = csv_ds.NodeData.load_from_csv(
meta_node, csv_ds.DefaultDataParser())
assert np.array_equal(df['node_id'], node_data.id)
assert len(node_data.data) == 1
assert np.array_equal(df['label'], node_data.data['label'])
assert np.array_equal(df['graph_id'], node_data.graph_id)
assert node_data.type == '_V'
# required header is missing
df = pd.DataFrame({'label': np.random.randint(3, size=num_nodes)})
csv_path = os.path.join(test_dir, 'nodes.csv')
df.to_csv(csv_path, index=False)
meta_node = csv_ds.MetaNode(file_name=csv_path)
expect_except = False
try:
csv_ds.NodeData.load_from_csv(
meta_node, csv_ds.DefaultDataParser())
except:
expect_except = True
assert expect_except
def _test_load_edge_data_from_csv():
with tempfile.TemporaryDirectory() as test_dir:
num_nodes = 100
num_edges = 1000
# minimum
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
'dst_id': np.random.randint(num_nodes, size=num_edges),
})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
edge_data = csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
assert np.array_equal(df['src_id'], edge_data.src)
assert np.array_equal(df['dst_id'], edge_data.dst)
assert len(edge_data.data) == 0
# common case
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
'dst_id': np.random.randint(num_nodes, size=num_edges),
'label': np.random.randint(3, size=num_edges)})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
edge_data = csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
assert np.array_equal(df['src_id'], edge_data.src)
assert np.array_equal(df['dst_id'], edge_data.dst)
assert len(edge_data.data) == 1
assert np.array_equal(df['label'], edge_data.data['label'])
assert np.array_equal(np.full(num_edges, 0), edge_data.graph_id)
assert edge_data.type == ('_V', '_E', '_V')
# add more fields into edges.csv
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
'dst_id': np.random.randint(num_nodes, size=num_edges),
'graph_id': np.arange(num_edges),
'feat': np.random.randint(3, size=num_edges),
'label': np.random.randint(3, size=num_edges)})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
edge_data = csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
assert np.array_equal(df['src_id'], edge_data.src)
assert np.array_equal(df['dst_id'], edge_data.dst)
assert len(edge_data.data) == 2
assert np.array_equal(df['feat'], edge_data.data['feat'])
assert np.array_equal(df['label'], edge_data.data['label'])
assert np.array_equal(df['graph_id'], edge_data.graph_id)
assert edge_data.type == ('_V', '_E', '_V')
# required headers are missing
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
expect_except = False
try:
csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
except DGLError:
expect_except = True
assert expect_except
df = pd.DataFrame({'dst_id': np.random.randint(num_nodes, size=num_edges),
})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
expect_except = False
try:
csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
except DGLError:
expect_except = True
assert expect_except
def _test_load_graph_data_from_csv():
with tempfile.TemporaryDirectory() as test_dir:
num_graphs = 100
# minimum
df = pd.DataFrame({'graph_id': np.arange(num_graphs)})
csv_path = os.path.join(test_dir, 'graph.csv')
df.to_csv(csv_path, index=False)
meta_graph = csv_ds.MetaGraph(file_name=csv_path)
graph_data = csv_ds.GraphData.load_from_csv(
meta_graph, csv_ds.DefaultDataParser())
assert np.array_equal(df['graph_id'], graph_data.graph_id)
assert len(graph_data.data) == 0
# common case
df = pd.DataFrame({'graph_id': np.arange(num_graphs),
'label': np.random.randint(3, size=num_graphs)})
csv_path = os.path.join(test_dir, 'graph.csv')
df.to_csv(csv_path, index=False)
meta_graph = csv_ds.MetaGraph(file_name=csv_path)
graph_data = csv_ds.GraphData.load_from_csv(
meta_graph, csv_ds.DefaultDataParser())
assert np.array_equal(df['graph_id'], graph_data.graph_id)
assert len(graph_data.data) == 1
assert np.array_equal(df['label'], graph_data.data['label'])
# add more fields into graph.csv
df = pd.DataFrame({'graph_id': np.arange(num_graphs),
'feat': np.random.randint(3, size=num_graphs),
'label': np.random.randint(3, size=num_graphs)})
csv_path = os.path.join(test_dir, 'graph.csv')
df.to_csv(csv_path, index=False)
meta_graph = csv_ds.MetaGraph(file_name=csv_path)
graph_data = csv_ds.GraphData.load_from_csv(
meta_graph, csv_ds.DefaultDataParser())
assert np.array_equal(df['graph_id'], graph_data.graph_id)
assert len(graph_data.data) == 2
assert np.array_equal(df['feat'], graph_data.data['feat'])
assert np.array_equal(df['label'], graph_data.data['label'])
# required header is missing
df = pd.DataFrame({'label': np.random.randint(3, size=num_graphs)})
csv_path = os.path.join(test_dir, 'graph.csv')
df.to_csv(csv_path, index=False)
meta_graph = csv_ds.MetaGraph(file_name=csv_path)
expect_except = False
try:
csv_ds.GraphData.load_from_csv(
meta_graph, csv_ds.DefaultDataParser())
except DGLError:
expect_except = True
assert expect_except
def _test_DGLCSVDataset_single():
with tempfile.TemporaryDirectory() as test_dir:
# generate YAML/CSVs
meta_yaml_path = os.path.join(test_dir, "meta.yaml")
edges_csv_path_0 = os.path.join(test_dir, "test_edges_0.csv")
edges_csv_path_1 = os.path.join(test_dir, "test_edges_1.csv")
nodes_csv_path_0 = os.path.join(test_dir, "test_nodes_0.csv")
nodes_csv_path_1 = os.path.join(test_dir, "test_nodes_1.csv")
meta_yaml_data = {'version': '1.0.0', 'dataset_name': 'default_name',
'node_data': [{'file_name': os.path.basename(nodes_csv_path_0),
'ntype': 'user',
},
{'file_name': os.path.basename(nodes_csv_path_1),
'ntype': 'item',
}],
'edge_data': [{'file_name': os.path.basename(edges_csv_path_0),
'etype': ['user', 'follow', 'user'],
},
{'file_name': os.path.basename(edges_csv_path_1),
'etype': ['user', 'like', 'item'],
}],
}
with open(meta_yaml_path, 'w') as f:
yaml.dump(meta_yaml_data, f, sort_keys=False)
num_nodes = 100
num_edges = 500
num_dims = 3
feat_ndata = np.random.rand(num_nodes, num_dims)
label_ndata = np.random.randint(2, size=num_nodes)
df = pd.DataFrame({'node_id': np.arange(num_nodes),
'label': label_ndata,
'feat': [line.tolist() for line in feat_ndata],
})
df.to_csv(nodes_csv_path_0, index=False)
df.to_csv(nodes_csv_path_1, index=False)
feat_edata = np.random.rand(num_edges, num_dims)
label_edata = np.random.randint(2, size=num_edges)
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
'dst_id': np.random.randint(num_nodes, size=num_edges),
'label': label_edata,
'feat': [line.tolist() for line in feat_edata],
})
df.to_csv(edges_csv_path_0, index=False)
df.to_csv(edges_csv_path_1, index=False)
# load CSVDataset
for force_reload in [True, False]:
if not force_reload:
# remove original node data file to verify reload from cached files
os.remove(nodes_csv_path_0)
assert not os.path.exists(nodes_csv_path_0)
csv_dataset = data.DGLCSVDataset(
test_dir, force_reload=force_reload)
assert len(csv_dataset) == 1
g = csv_dataset[0]
assert not g.is_homogeneous
assert csv_dataset.has_cache()
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
assert F.array_equal(F.tensor(feat_ndata),
g.nodes[ntype].data['feat'])
assert np.array_equal(label_ndata,
F.asnumpy(g.nodes[ntype].data['label']))
for etype in g.etypes:
assert g.num_edges(etype) == num_edges
assert F.array_equal(F.tensor(feat_edata),
g.edges[etype].data['feat'])
assert np.array_equal(label_edata,
F.asnumpy(g.edges[etype].data['label']))
def _test_DGLCSVDataset_multiple():
with tempfile.TemporaryDirectory() as test_dir:
# generate YAML/CSVs
meta_yaml_path = os.path.join(test_dir, "meta.yaml")
edges_csv_path_0 = os.path.join(test_dir, "test_edges_0.csv")
edges_csv_path_1 = os.path.join(test_dir, "test_edges_1.csv")
nodes_csv_path_0 = os.path.join(test_dir, "test_nodes_0.csv")
nodes_csv_path_1 = os.path.join(test_dir, "test_nodes_1.csv")
graph_csv_path = os.path.join(test_dir, "test_graph.csv")
meta_yaml_data = {'version': '1.0.0', 'dataset_name': 'default_name',
'node_data': [{'file_name': os.path.basename(nodes_csv_path_0),
'ntype': 'user',
},
{'file_name': os.path.basename(nodes_csv_path_1),
'ntype': 'item',
}],
'edge_data': [{'file_name': os.path.basename(edges_csv_path_0),
'etype': ['user', 'follow', 'user'],
},
{'file_name': os.path.basename(edges_csv_path_1),
'etype': ['user', 'like', 'item'],
}],
'graph_data': {'file_name': os.path.basename(graph_csv_path)}
}
with open(meta_yaml_path, 'w') as f:
yaml.dump(meta_yaml_data, f, sort_keys=False)
num_nodes = 100
num_edges = 500
num_graphs = 10
num_dims = 3
feat_ndata = np.random.rand(num_nodes*num_graphs, num_dims)
label_ndata = np.random.randint(2, size=num_nodes*num_graphs)
df = pd.DataFrame({'node_id': np.hstack([np.arange(num_nodes) for _ in range(num_graphs)]),
'label': label_ndata,
'feat': [line.tolist() for line in feat_ndata],
'graph_id': np.hstack([np.full(num_nodes, i) for i in range(num_graphs)])
})
df.to_csv(nodes_csv_path_0, index=False)
df.to_csv(nodes_csv_path_1, index=False)
feat_edata = np.random.rand(num_edges*num_graphs, num_dims)
label_edata = np.random.randint(2, size=num_edges*num_graphs)
df = pd.DataFrame({'src_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]),
'dst_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]),
'label': label_edata,
'feat': [line.tolist() for line in feat_edata],
'graph_id': np.hstack([np.full(num_edges, i) for i in range(num_graphs)])
})
df.to_csv(edges_csv_path_0, index=False)
df.to_csv(edges_csv_path_1, index=False)
feat_gdata = np.random.rand(num_graphs, num_dims)
label_gdata = np.random.randint(2, size=num_graphs)
df = pd.DataFrame({'label': label_gdata,
'feat': [line.tolist() for line in feat_gdata],
'graph_id': np.arange(num_graphs)
})
df.to_csv(graph_csv_path, index=False)
# load CSVDataset with default node/edge/graph_data_parser
for force_reload in [True, False]:
if not force_reload:
# remove original node data file to verify reload from cached files
os.remove(nodes_csv_path_0)
assert not os.path.exists(nodes_csv_path_0)
csv_dataset = data.DGLCSVDataset(
test_dir, force_reload=force_reload)
assert len(csv_dataset) == num_graphs
assert csv_dataset.has_cache()
assert len(csv_dataset.data) == 2
assert 'feat' in csv_dataset.data
assert 'label' in csv_dataset.data
assert F.array_equal(F.tensor(feat_gdata),
csv_dataset.data['feat'])
for i, (g, label) in enumerate(csv_dataset):
assert not g.is_homogeneous
assert F.asnumpy(label) == label_gdata[i]
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
assert F.array_equal(F.tensor(feat_ndata[i*num_nodes:(i+1)*num_nodes]),
g.nodes[ntype].data['feat'])
assert np.array_equal(label_ndata[i*num_nodes:(i+1)*num_nodes],
F.asnumpy(g.nodes[ntype].data['label']))
for etype in g.etypes:
assert g.num_edges(etype) == num_edges
assert F.array_equal(F.tensor(feat_edata[i*num_edges:(i+1)*num_edges]),
g.edges[etype].data['feat'])
assert np.array_equal(label_edata[i*num_edges:(i+1)*num_edges],
F.asnumpy(g.edges[etype].data['label']))
def _test_DGLCSVDataset_customized_data_parser():
with tempfile.TemporaryDirectory() as test_dir:
# generate YAML/CSVs
meta_yaml_path = os.path.join(test_dir, "meta.yaml")
edges_csv_path_0 = os.path.join(test_dir, "test_edges_0.csv")
edges_csv_path_1 = os.path.join(test_dir, "test_edges_1.csv")
nodes_csv_path_0 = os.path.join(test_dir, "test_nodes_0.csv")
nodes_csv_path_1 = os.path.join(test_dir, "test_nodes_1.csv")
graph_csv_path = os.path.join(test_dir, "test_graph.csv")
meta_yaml_data = {'dataset_name': 'default_name',
'node_data': [{'file_name': os.path.basename(nodes_csv_path_0),
'ntype': 'user',
},
{'file_name': os.path.basename(nodes_csv_path_1),
'ntype': 'item',
}],
'edge_data': [{'file_name': os.path.basename(edges_csv_path_0),
'etype': ['user', 'follow', 'user'],
},
{'file_name': os.path.basename(edges_csv_path_1),
'etype': ['user', 'like', 'item'],
}],
'graph_data': {'file_name': os.path.basename(graph_csv_path)}
}
with open(meta_yaml_path, 'w') as f:
yaml.dump(meta_yaml_data, f, sort_keys=False)
num_nodes = 100
num_edges = 500
num_graphs = 10
label_ndata = np.random.randint(2, size=num_nodes*num_graphs)
df = pd.DataFrame({'node_id': np.hstack([np.arange(num_nodes) for _ in range(num_graphs)]),
'label': label_ndata,
'graph_id': np.hstack([np.full(num_nodes, i) for i in range(num_graphs)])
})
df.to_csv(nodes_csv_path_0, index=False)
df.to_csv(nodes_csv_path_1, index=False)
label_edata = np.random.randint(2, size=num_edges*num_graphs)
df = pd.DataFrame({'src_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]),
'dst_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]),
'label': label_edata,
'graph_id': np.hstack([np.full(num_edges, i) for i in range(num_graphs)])
})
df.to_csv(edges_csv_path_0, index=False)
df.to_csv(edges_csv_path_1, index=False)
label_gdata = np.random.randint(2, size=num_graphs)
df = pd.DataFrame({'label': label_gdata,
'graph_id': np.arange(num_graphs)
})
df.to_csv(graph_csv_path, index=False)
class CustDataParser:
def __call__(self, df):
data = {}
for header in df:
dt = df[header].to_numpy().squeeze()
if header == 'label':
dt += 2
data[header] = dt
return data
# load CSVDataset with customized node/edge/graph_data_parser
csv_dataset = data.DGLCSVDataset(
test_dir, node_data_parser={'user': CustDataParser()}, edge_data_parser={('user', 'like', 'item'): CustDataParser()}, graph_data_parser=CustDataParser())
assert len(csv_dataset) == num_graphs
assert len(csv_dataset.data) == 1
assert 'label' in csv_dataset.data
for i, (g, label) in enumerate(csv_dataset):
assert not g.is_homogeneous
assert F.asnumpy(label) == label_gdata[i] + 2
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
offset = 2 if ntype == 'user' else 0
assert np.array_equal(label_ndata[i*num_nodes:(i+1)*num_nodes]+offset,
F.asnumpy(g.nodes[ntype].data['label']))
for etype in g.etypes:
assert g.num_edges(etype) == num_edges
offset = 2 if etype == 'like' else 0
assert np.array_equal(label_edata[i*num_edges:(i+1)*num_edges]+offset,
F.asnumpy(g.edges[etype].data['label']))
def _test_NodeEdgeGraphData():
# NodeData basics
num_nodes = 100
node_ids = np.arange(num_nodes, dtype=np.float64)
ndata = csv_ds.NodeData(node_ids, {})
assert ndata.id.dtype == np.int64
assert np.array_equal(ndata.id, node_ids.astype(np.int64))
assert len(ndata.data) == 0
assert ndata.type == '_V'
assert np.array_equal(ndata.graph_id, np.full(num_nodes, 0))
# NodeData more
data = {'feat': np.random.rand(num_nodes, 3)}
graph_id = np.arange(num_nodes)
ndata = csv_ds.NodeData(node_ids, data, type='user', graph_id=graph_id)
assert ndata.type == 'user'
assert np.array_equal(ndata.graph_id, graph_id)
assert len(ndata.data) == len(data)
for k, v in data.items():
assert k in ndata.data
assert np.array_equal(ndata.data[k], v)
# NodeData except
expect_except = False
try:
csv_ds.NodeData(np.arange(num_nodes), {'feat': np.random.rand(
num_nodes+1, 3)}, graph_id=np.arange(num_nodes-1))
except:
expect_except = True
assert expect_except
# EdgeData basics
num_nodes = 100
num_edges = 1000
src_ids = np.random.randint(num_nodes, size=num_edges)
dst_ids = np.random.randint(num_nodes, size=num_edges)
edata = csv_ds.EdgeData(src_ids, dst_ids, {})
assert np.array_equal(edata.src, src_ids)
assert np.array_equal(edata.dst, dst_ids)
assert edata.type == ('_V', '_E', '_V')
assert len(edata.data) == 0
assert np.array_equal(edata.graph_id, np.full(num_edges, 0))
# EdgeData more
src_ids = np.random.randint(num_nodes, size=num_edges).astype(np.float64)
dst_ids = np.random.randint(num_nodes, size=num_edges).astype(np.float64)
data = {'feat': np.random.rand(num_edges, 3)}
etype = ('user', 'like', 'item')
graph_ids = np.arange(num_edges)
edata = csv_ds.EdgeData(src_ids, dst_ids, data,
type=etype, graph_id=graph_ids)
assert edata.src.dtype == np.int64
assert edata.dst.dtype == np.int64
assert np.array_equal(edata.src, src_ids)
assert np.array_equal(edata.dst, dst_ids)
assert edata.type == etype
assert len(edata.data) == len(data)
for k, v in data.items():
assert k in edata.data
assert np.array_equal(edata.data[k], v)
assert np.array_equal(edata.graph_id, graph_ids)
# EdgeData except
expect_except = False
try:
csv_ds.EdgeData(np.arange(num_edges), np.arange(
num_edges+1), {'feat': np.random.rand(num_edges-1, 3)}, graph_id=np.arange(num_edges+2))
except:
expect_except = True
assert expect_except
# GraphData basics
num_graphs = 10
graph_ids = np.arange(num_graphs)
gdata = csv_ds.GraphData(graph_ids, {})
assert np.array_equal(gdata.graph_id, graph_ids)
assert len(gdata.data) == 0
# GraphData more
graph_ids = np.arange(num_graphs).astype(np.float64)
data = {'feat': np.random.rand(num_graphs, 3)}
gdata = csv_ds.GraphData(graph_ids, data)
assert gdata.graph_id.dtype == np.int64
assert np.array_equal(gdata.graph_id, graph_ids)
assert len(gdata.data) == len(data)
for k, v in data.items():
assert k in gdata.data
assert np.array_equal(gdata.data[k], v)
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_csvdataset():
_test_NodeEdgeGraphData()
_test_construct_graphs_homo()
_test_construct_graphs_hetero()
_test_construct_graphs_multiple()
_test_DefaultDataParser()
_test_load_yaml_with_sanity_check()
_test_load_node_data_from_csv()
_test_load_edge_data_from_csv()
_test_load_graph_data_from_csv()
_test_DGLCSVDataset_single()
_test_DGLCSVDataset_multiple()
_test_DGLCSVDataset_customized_data_parser()
if __name__ == '__main__':
test_minigc()
test_gin()
@@ -164,3 +1014,4 @@ if __name__ == '__main__':
test_fraud()
test_fakenews()
test_extract_archive()
test_csvdataset()
@@ -14,7 +14,7 @@ SET DGLBACKEND=!BACKEND!
SET DGL_LIBRARY_PATH=!CD!\build
SET DGL_DOWNLOAD_DIR=!CD!
python -m pip install pytest pyyaml pandas pydantic || EXIT /B 1
python -m pytest -v --junitxml=pytest_backend.xml tests\!DGLBACKEND! || EXIT /B 1
python -m pytest -v --junitxml=pytest_compute.xml tests\compute || EXIT /B 1
ENDLOCAL
......
@@ -32,6 +32,7 @@ fi
conda activate ${DGLBACKEND}-ci
python3 -m pip install pytest pyyaml pandas pydantic || fail "pip install"
python3 -m pytest -v --junitxml=pytest_compute.xml tests/compute || fail "compute"
python3 -m pytest -v --junitxml=pytest_backend.xml tests/$DGLBACKEND || fail "backend-specific"
......