Unverified Commit 95c0ff63 authored by Rhett Ying's avatar Rhett Ying Committed by GitHub

[Feature] add CSVDataset to load data from csv files (#3547)

* [Feature] add CSVDataset to load data from csv files

* add CSVDataset class file

* install pyyaml when running unit tests

* install pandas for unit tests

* utilize pydantic for YAML config check

* generate yaml and csv files during test

* make more keys as optional

* remove/rename several keys in yaml config and add more tests, though it looks a bit clumsy

* fix test failure on mxnet

* pass /path/to/dataset instead of yaml path

* code refinement

* code refine

* change several yaml field such as feat and graph_id

* merge graph generation logic

* refine code

* Refactored_first_version

* DGLCSVDataset works for single heterograph

* add more tests

* fix test failure in mxnet

* add docstring

* use list comprehension for dict

* fix version in YAML

* refine data length assert

* use dict.pop directly

* remove ambiguous variable names

* refine graph id missing logic

* refine graph create call

* separate node/edge/graph data parser

* remove separator in DefaultDataParser

* refine validation error log for yaml field

* minor check

* refine code via dict.get()

* move load_from_csv into Node/Edge/GraphData

* move _parse_node/edge/graph_data into Node/Edge/GraphData

* refine id-related fields check

* check duplicate ntypes/etypes when load yaml

* refine docstring
parent c04b5bc7
......@@ -29,6 +29,7 @@ from .knowledge_graph import FB15k237Dataset, FB15kDataset, WN18Dataset
from .rdf import AIFBDataset, MUTAGDataset, BGSDataset, AMDataset
from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
from .fakenews import FakeNewsDataset
from .csv_dataset import DGLCSVDataset
def register_data_args(parser):
......
import os
import yaml
from yaml.loader import SafeLoader
import pandas as pd
import numpy as np
from typing import List, Optional
import pydantic as dt
from .dgl_dataset import DGLDataset
from ..convert import heterograph as dgl_heterograph
from .. import backend as F
from .utils import save_graphs, load_graphs
from ..base import dgl_warning, DGLError
import abc
import ast
from typing import Callable
class MetaNode(dt.BaseModel):
""" Class of node_data in YAML. Internal use only. """
file_name: str
ntype: Optional[str] = '_V'
graph_id_field: Optional[str] = 'graph_id'
node_id_field: Optional[str] = 'node_id'
class MetaEdge(dt.BaseModel):
""" Class of edge_data in YAML. Internal use only. """
file_name: str
etype: Optional[List[str]] = ['_V', '_E', '_V']
graph_id_field: Optional[str] = 'graph_id'
src_id_field: Optional[str] = 'src_id'
dst_id_field: Optional[str] = 'dst_id'
class MetaGraph(dt.BaseModel):
""" Class of graph_data in YAML. Internal use only. """
file_name: str
graph_id_field: Optional[str] = 'graph_id'
class MetaYaml(dt.BaseModel):
""" Class of YAML. Internal use only. """
version: Optional[str] = '1.0.0'
dataset_name: str
separator: Optional[str] = ','
node_data: List[MetaNode]
edge_data: List[MetaEdge]
graph_data: Optional[MetaGraph] = None
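# A minimal sketch of a conforming meta.yaml, assuming hypothetical file names
# nodes.csv/edges.csv/graphs.csv (any omitted optional field falls back to the
# defaults declared in the pydantic models above):
#
#   dataset_name: example_dataset
#   separator: ','
#   node_data:
#     - file_name: nodes.csv
#       ntype: user
#   edge_data:
#     - file_name: edges.csv
#       etype: [user, follow, user]
#   graph_data:
#     file_name: graphs.csv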
def load_yaml_with_sanity_check(yaml_file):
""" Load yaml and do sanity check. Internal use only. """
with open(yaml_file) as f:
yaml_data = yaml.load(f, Loader=SafeLoader)
try:
meta_yaml = MetaYaml(**yaml_data)
except dt.ValidationError as e:
print(
"Details of pydantic.ValidationError:\n{}".format(e.json()))
raise DGLError(
"Validation Error for YAML fields. Details are shown above.")
if meta_yaml.version != '1.0.0':
raise DGLError("Invalid CSVDataset version {}. Supported versions: '1.0.0'".format(
meta_yaml.version))
ntypes = [meta.ntype for meta in meta_yaml.node_data]
if len(ntypes) > len(set(ntypes)):
raise DGLError(
"Each node CSV file must have a unique node type name, but found duplicate node type: {}.".format(ntypes))
etypes = [tuple(meta.etype) for meta in meta_yaml.edge_data]
if len(etypes) > len(set(etypes)):
raise DGLError(
"Each edge CSV file must have a unique edge type name, but found duplicate edge type: {}.".format(etypes))
return meta_yaml
def _validate_data_length(data_dict):
len_dict = {k: len(v) for k, v in data_dict.items()}
lst = list(len_dict.values())
res = lst.count(lst[0]) == len(lst)
if not res:
raise DGLError(
"All data are required to have same length while some of them does not. Length of data={}".format(str(len_dict)))
class BaseData:
""" Class of base data which is inherited by Node/Edge/GraphData. Internal use only. """
@staticmethod
def read_csv(file_name, base_dir, separator):
csv_path = file_name
if base_dir is not None:
csv_path = os.path.join(base_dir, csv_path)
return pd.read_csv(csv_path, sep=separator)
@staticmethod
def pop_from_dataframe(df: pd.DataFrame, item: str):
ret = None
try:
ret = df.pop(item).to_numpy().squeeze()
except KeyError:
pass
return ret
class NodeData(BaseData):
""" Class of node data which is used for DGLGraph construction. Internal use only. """
def __init__(self, node_id, data, type=None, graph_id=None):
self.id = np.array(node_id, dtype=np.int64)
self.data = data
self.type = type if type is not None else '_V'
self.graph_id = np.array(graph_id, dtype=np.int64) if graph_id is not None else np.full(
len(node_id), 0)
_validate_data_length({**{'id': self.id, 'graph_id': self.graph_id}, **self.data})
@staticmethod
def load_from_csv(meta: MetaNode, data_parser: Callable, base_dir=None, separator=','):
df = BaseData.read_csv(meta.file_name, base_dir, separator)
node_ids = BaseData.pop_from_dataframe(df, meta.node_id_field)
graph_ids = BaseData.pop_from_dataframe(df, meta.graph_id_field)
if node_ids is None:
raise DGLError("Missing node id field [{}] in file [{}].".format(
meta.node_id_field, meta.file_name))
ntype = meta.ntype
ndata = data_parser(df)
return NodeData(node_ids, ndata, type=ntype, graph_id=graph_ids)
@staticmethod
def to_dict(node_data: List['NodeData']) -> dict:
# node_ids could be arbitrary numeric values: unsorted, duplicated, or not labeled from 0 to num_nodes-1
node_dict = {}
for n_data in node_data:
graph_ids = np.unique(n_data.graph_id)
for graph_id in graph_ids:
idx = n_data.graph_id == graph_id
ids = n_data.id[idx]
u_ids, u_indices = np.unique(ids, return_index=True)
if len(ids) > len(u_ids):
dgl_warning(
"There exist duplicated ids and only the first ones are kept.")
if graph_id not in node_dict:
node_dict[graph_id] = {}
node_dict[graph_id][n_data.type] = {'mapping': {index: i for i,
index in enumerate(ids[u_indices])},
'data': {k: F.tensor(v[idx][u_indices])
for k, v in n_data.data.items()}}
return node_dict
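# Illustrative note (not from the original code): within one graph, raw node ids are
# remapped to consecutive ids via np.unique. For example, raw ids [5, 9, 5, 2] yield
# u_ids = [2, 5, 9] with first-occurrence indices u_indices = [3, 0, 1], so the stored
# 'mapping' is {2: 0, 5: 1, 9: 2}; the duplicated id 5 triggers the warning above and
# only its first row of data is kept.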
class EdgeData(BaseData):
""" Class of edge data which is used for DGLGraph construction. Internal use only. """
def __init__(self, src_id, dst_id, data, type=None, graph_id=None):
self.src = np.array(src_id, dtype=np.int64)
self.dst = np.array(dst_id, dtype=np.int64)
self.data = data
self.type = type if type is not None else ('_V', '_E', '_V')
self.graph_id = np.array(graph_id, dtype=np.int64) if graph_id is not None else np.full(
len(src_id), 0)
_validate_data_length({**{'src': self.src, 'dst': self.dst, 'graph_id': self.graph_id}, **self.data})
@staticmethod
def load_from_csv(meta: MetaEdge, data_parser: Callable, base_dir=None, separator=','):
df = BaseData.read_csv(meta.file_name, base_dir, separator)
src_ids = BaseData.pop_from_dataframe(df, meta.src_id_field)
if src_ids is None:
raise DGLError("Missing src id field [{}] in file [{}].".format(
meta.src_id_field, meta.file_name))
dst_ids = BaseData.pop_from_dataframe(df, meta.dst_id_field)
if dst_ids is None:
raise DGLError("Missing dst id field [{}] in file [{}].".format(
meta.dst_id_field, meta.file_name))
graph_ids = BaseData.pop_from_dataframe(df, meta.graph_id_field)
etype = tuple(meta.etype)
edata = data_parser(df)
return EdgeData(src_ids, dst_ids, edata, type=etype, graph_id=graph_ids)
@staticmethod
def to_dict(edge_data: List['EdgeData'], node_dict: dict) -> dict:
edge_dict = {}
for e_data in edge_data:
(src_type, e_type, dst_type) = e_data.type
graph_ids = np.unique(e_data.graph_id)
for graph_id in graph_ids:
if graph_id in edge_dict and e_data.type in edge_dict[graph_id]:
raise DGLError(f"Duplicate edge type[{e_data.type}] for same graph[{graph_id}], please place the same edge_type for same graph into single EdgeData.")
idx = e_data.graph_id == graph_id
src_mapping = node_dict[graph_id][src_type]['mapping']
dst_mapping = node_dict[graph_id][dst_type]['mapping']
src_ids = [src_mapping[index] for index in e_data.src[idx]]
dst_ids = [dst_mapping[index] for index in e_data.dst[idx]]
if graph_id not in edge_dict:
edge_dict[graph_id] = {}
edge_dict[graph_id][e_data.type] = {'edges': (F.tensor(src_ids), F.tensor(dst_ids)),
'data': {k: F.tensor(v[idx])
for k, v in e_data.data.items()}}
return edge_dict
class GraphData(BaseData):
""" Class of graph data which is used for DGLGraph construction. Internal use only. """
def __init__(self, graph_id, data):
self.graph_id = np.array(graph_id, dtype=np.int64)
self.data = data
_validate_data_length({**{'graph_id': self.graph_id}, **self.data})
@staticmethod
def load_from_csv(meta: MetaGraph, data_parser: Callable, base_dir=None, separator=','):
df = BaseData.read_csv(meta.file_name, base_dir, separator)
graph_ids = BaseData.pop_from_dataframe(df, meta.graph_id_field)
if graph_ids is None:
raise DGLError("Missing graph id field [{}] in file [{}].".format(
meta.graph_id_field, meta.file_name))
gdata = data_parser(df)
return GraphData(graph_ids, gdata)
@staticmethod
def to_dict(graph_data: 'GraphData', graphs_dict: dict) -> dict:
missing_ids = np.setdiff1d(
np.array(list(graphs_dict.keys())), graph_data.graph_id)
if len(missing_ids) > 0:
raise DGLError(
"Found following graph ids in node/edge CSVs but not in graph CSV: {}.".format(missing_ids))
graph_ids = graph_data.graph_id
graphs = []
for graph_id in graph_ids:
if graph_id not in graphs_dict:
graphs_dict[graph_id] = dgl_heterograph(
{('_V', '_E', '_V'): ([], [])})
for graph_id in graph_ids:
graphs.append(graphs_dict[graph_id])
data = {k: F.tensor(v) for k, v in graph_data.data.items()}
return graphs, data
class DGLGraphConstructor:
""" Class for constructing DGLGraph from Node/Edge/Graph data. Internal use only. """
@staticmethod
def construct_graphs(node_data, edge_data, graph_data=None):
if not isinstance(node_data, list):
node_data = [node_data]
if not isinstance(edge_data, list):
edge_data = [edge_data]
node_dict = NodeData.to_dict(node_data)
edge_dict = EdgeData.to_dict(edge_data, node_dict)
graph_dict = DGLGraphConstructor._construct_graphs(
node_dict, edge_dict)
if graph_data is None:
graph_data = GraphData(np.full(1, 0), {})
graphs, data = GraphData.to_dict(
graph_data, graph_dict)
return graphs, data
@staticmethod
def _construct_graphs(node_dict, edge_dict):
graph_dict = {}
for graph_id in node_dict:
if graph_id not in edge_dict:
edge_dict[graph_id] = {('_V', '_E', '_V'): {'edges': ([], [])}}
graph = dgl_heterograph({etype: edata['edges']
for etype, edata in edge_dict[graph_id].items()},
num_nodes_dict={ntype: len(ndata['mapping'])
for ntype, ndata in node_dict[graph_id].items()})
def assign_data(type, src_data, dst_data):
for key, value in src_data.items():
dst_data[type].data[key] = value
for type, data in node_dict[graph_id].items():
assign_data(type, data['data'], graph.nodes)
for type, data in edge_dict[graph_id].items():
assign_data(type, data['data'], graph.edges)
graph_dict[graph_id] = graph
return graph_dict
class DefaultDataParser:
""" Default data parser for DGLCSVDataset. It
1. ignores any column which does not have a header.
2. tries to convert the cell into a list of numeric values (as generated
by np.array().tolist()) if the cell data is a str such as '[0.1, 0.2]'.
3. reads data and infers the data type directly, otherwise.
"""
def __call__(self, df: pd.DataFrame):
data = {}
for header in df:
if 'Unnamed' in header:
dgl_warning("Unamed column is found. Ignored...")
continue
dt = df[header].to_numpy().squeeze()
if len(dt) > 0 and isinstance(dt[0], str):
# probably consists of a list of numeric values
dt = np.array([ast.literal_eval(row) for row in dt])
data[header] = dt
return data
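# A minimal usage sketch of the parser above, assuming a hypothetical in-memory
# DataFrame (when reading from CSV, a list-valued 'feat' column written via
# np.array(...).tolist() arrives as such strings):
#
#   df = pd.DataFrame({'label': [0, 1],
#                      'feat': ['[0.1, 0.2]', '[0.3, 0.4]']})
#   parsed = DefaultDataParser()(df)
#   # parsed['label'] -> array([0, 1])
#   # parsed['feat']  -> array([[0.1, 0.2], [0.3, 0.4]])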
class DGLCSVDataset(DGLDataset):
""" This class aims to parse data from CSV files, construct DGLGraph
and behaves as a DGLDataset.
Parameters
----------
data_path : str
Directory which contains 'meta.yaml' and CSV files
force_reload : bool, optional
Whether to reload the dataset. Default: False
verbose: bool, optional
Whether to print out progress information. Default: True.
node_data_parser : dict[str, callable], optional
A dictionary used for node data parsing when loading from CSV files.
The key is the node type and the value is a callable object which is
used to parse the data columns of the corresponding CSV file.
Default: None. If None, a default data parser is applied which loads
data directly and tries to convert lists into arrays.
edge_data_parser : dict[(str, str, str), callable], optional
A dictionary used for edge data parsing when loading from CSV files.
The key is the canonical edge type and the value is a callable object
which is used to parse the data columns of the corresponding CSV file.
Default: None. If None, a default data parser is applied which loads
data directly and tries to convert lists into arrays.
graph_data_parser : callable, optional
A callable object which is used to parse the data columns of the graph
CSV file. Default: None. If None, a default data parser is applied
which loads data directly and tries to convert lists into arrays.
Attributes
----------
graphs : list of :class:`dgl.DGLGraph`
Graphs of the dataset.
data : dict
Any available graph-level data such as graph-level features or labels.
Examples
--------
[TODO]: link to a detailed web page.
"""
META_YAML_NAME = 'meta.yaml'
def __init__(self, data_path, force_reload=False, verbose=True, node_data_parser=None, edge_data_parser=None, graph_data_parser=None):
self.graphs = None
self.data = None
self.node_data_parser = {} if node_data_parser is None else node_data_parser
self.edge_data_parser = {} if edge_data_parser is None else edge_data_parser
self.graph_data_parser = graph_data_parser
self.default_data_parser = DefaultDataParser()
meta_yaml_path = os.path.join(data_path, DGLCSVDataset.META_YAML_NAME)
if not os.path.exists(meta_yaml_path):
raise DGLError(
"'{}' cannot be found under {}.".format(DGLCSVDataset.META_YAML_NAME, data_path))
self.meta_yaml = load_yaml_with_sanity_check(meta_yaml_path)
ds_name = self.meta_yaml.dataset_name
super().__init__(ds_name, raw_dir=os.path.dirname(
meta_yaml_path), force_reload=force_reload, verbose=verbose)
def process(self):
"""Parse node/edge data from CSV files and construct DGL.Graphs
"""
meta_yaml = self.meta_yaml
base_dir = self.raw_dir
node_data = []
for meta_node in meta_yaml.node_data:
if meta_node is None:
continue
ntype = meta_node.ntype
data_parser = self.node_data_parser.get(
ntype, self.default_data_parser)
ndata = NodeData.load_from_csv(
meta_node, base_dir=base_dir, separator=meta_yaml.separator, data_parser=data_parser)
node_data.append(ndata)
edge_data = []
for meta_edge in meta_yaml.edge_data:
if meta_edge is None:
continue
etype = tuple(meta_edge.etype)
data_parser = self.edge_data_parser.get(
etype, self.default_data_parser)
edata = EdgeData.load_from_csv(
meta_edge, base_dir=base_dir, separator=meta_yaml.separator, data_parser=data_parser)
edge_data.append(edata)
graph_data = None
if meta_yaml.graph_data is not None:
meta_graph = meta_yaml.graph_data
data_parser = self.default_data_parser if self.graph_data_parser is None else self.graph_data_parser
graph_data = GraphData.load_from_csv(
meta_graph, base_dir=base_dir, separator=meta_yaml.separator, data_parser=data_parser)
# construct graphs
self.graphs, self.data = DGLGraphConstructor.construct_graphs(
node_data, edge_data, graph_data)
def has_cache(self):
graph_path = os.path.join(self.save_path,
self.name + '.bin')
return os.path.exists(graph_path)
def save(self):
if self.graphs is None:
raise DGLError("No graphs available in dataset")
graph_path = os.path.join(self.save_path,
self.name + '.bin')
save_graphs(graph_path, self.graphs,
labels=self.data)
def load(self):
graph_path = os.path.join(self.save_path,
self.name + '.bin')
self.graphs, self.data = load_graphs(graph_path)
def __getitem__(self, i):
if 'label' in self.data:
return self.graphs[i], self.data['label'][i]
else:
return self.graphs[i]
def __len__(self):
return len(self.graphs)
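# A minimal end-to-end usage sketch (illustrative only; '/path/to/dataset' is a
# placeholder for a directory containing meta.yaml and the CSV files it references):
#
#   import dgl.data
#   dataset = dgl.data.DGLCSVDataset('/path/to/dataset')
#   g = dataset[0]  # a DGLGraph, or (graph, label) if graph-level 'label' data exists
#   print(len(dataset))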
import dgl.data as data
import unittest
import backend as F
import numpy as np
import gzip
import tempfile
import os
import pandas as pd
import yaml
import pytest
import dgl.data as data
import dgl.data.csv_dataset as csv_ds
from dgl import DGLError
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
......@@ -59,13 +64,15 @@ def test_tudataset_regression():
def test_data_hash():
class HashTestDataset(data.DGLDataset):
def __init__(self, hash_key=()):
super(HashTestDataset, self).__init__('hashtest', hash_key=hash_key)
super(HashTestDataset, self).__init__(
'hashtest', hash_key=hash_key)
def _load(self):
pass
a = HashTestDataset((True, 0, '1', (1,2,3)))
b = HashTestDataset((True, 0, '1', (1,2,3)))
c = HashTestDataset((True, 0, '1', (1,2,4)))
a = HashTestDataset((True, 0, '1', (1, 2, 3)))
b = HashTestDataset((True, 0, '1', (1, 2, 3)))
c = HashTestDataset((True, 0, '1', (1, 2, 4)))
assert a.hash == b.hash
assert a.hash != c.hash
......@@ -156,6 +163,849 @@ def test_extract_archive():
assert os.path.exists(os.path.join(dst_dir, gz_file))
def _test_construct_graphs_homo():
# node_ids could be unsorted, duplicated, or not labeled from 0 to num_nodes-1
num_nodes = 100
num_edges = 1000
num_dims = 3
num_dup_nodes = int(num_nodes*0.2)
node_ids = np.random.choice(
np.arange(num_nodes*2), size=num_nodes, replace=False)
assert len(node_ids) == num_nodes
np.random.shuffle(node_ids)
node_ids = np.hstack((node_ids, node_ids[:num_dup_nodes]))
t_ndata = {'feat': np.random.rand(num_nodes+num_dup_nodes, num_dims),
'label': np.random.randint(2, size=num_nodes+num_dup_nodes)}
_, u_indices = np.unique(node_ids, return_index=True)
ndata = {'feat': t_ndata['feat'][u_indices],
'label': t_ndata['label'][u_indices]}
node_data = csv_ds.NodeData(node_ids, t_ndata)
src_ids = np.random.choice(node_ids, size=num_edges)
dst_ids = np.random.choice(node_ids, size=num_edges)
edata = {'feat': np.random.rand(
num_edges, num_dims), 'label': np.random.randint(2, size=num_edges)}
edge_data = csv_ds.EdgeData(src_ids, dst_ids, edata)
graphs, data_dict = csv_ds.DGLGraphConstructor.construct_graphs(
node_data, edge_data)
assert len(graphs) == 1
assert len(data_dict) == 0
g = graphs[0]
assert g.is_homogeneous
assert g.num_nodes() == num_nodes
assert g.num_edges() == num_edges
def assert_data(lhs, rhs):
for key, value in lhs.items():
assert key in rhs
assert F.array_equal(F.tensor(value), rhs[key])
assert_data(ndata, g.ndata)
assert_data(edata, g.edata)
def _test_construct_graphs_hetero():
# node_ids could be unsorted, duplicated, or not labeled from 0 to num_nodes-1
num_nodes = 100
num_edges = 1000
num_dims = 3
num_dup_nodes = int(num_nodes*0.2)
ntypes = ['user', 'item']
node_data = []
node_ids_dict = {}
ndata_dict = {}
for ntype in ntypes:
node_ids = np.random.choice(
np.arange(num_nodes*2), size=num_nodes, replace=False)
assert len(node_ids) == num_nodes
np.random.shuffle(node_ids)
node_ids = np.hstack((node_ids, node_ids[:num_dup_nodes]))
t_ndata = {'feat': np.random.rand(num_nodes+num_dup_nodes, num_dims),
'label': np.random.randint(2, size=num_nodes+num_dup_nodes)}
_, u_indices = np.unique(node_ids, return_index=True)
ndata = {'feat': t_ndata['feat'][u_indices],
'label': t_ndata['label'][u_indices]}
node_data.append(csv_ds.NodeData(node_ids, t_ndata, type=ntype))
node_ids_dict[ntype] = node_ids
ndata_dict[ntype] = ndata
etypes = [('user', 'follow', 'user'), ('user', 'like', 'item')]
edge_data = []
edata_dict = {}
for src_type, e_type, dst_type in etypes:
src_ids = np.random.choice(node_ids_dict[src_type], size=num_edges)
dst_ids = np.random.choice(node_ids_dict[dst_type], size=num_edges)
edata = {'feat': np.random.rand(
num_edges, num_dims), 'label': np.random.randint(2, size=num_edges)}
edge_data.append(csv_ds.EdgeData(src_ids, dst_ids, edata,
type=(src_type, e_type, dst_type)))
edata_dict[(src_type, e_type, dst_type)] = edata
graphs, data_dict = csv_ds.DGLGraphConstructor.construct_graphs(
node_data, edge_data)
assert len(graphs) == 1
assert len(data_dict) == 0
g = graphs[0]
assert not g.is_homogeneous
assert g.num_nodes() == num_nodes*len(ntypes)
assert g.num_edges() == num_edges*len(etypes)
def assert_data(lhs, rhs):
for key, value in lhs.items():
assert key in rhs
assert F.array_equal(F.tensor(value), rhs[key])
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
assert_data(ndata_dict[ntype], g.nodes[ntype].data)
for etype in g.canonical_etypes:
assert g.num_edges(etype) == num_edges
assert_data(edata_dict[etype], g.edges[etype].data)
def _test_construct_graphs_multiple():
num_nodes = 100
num_edges = 1000
num_graphs = 10
num_dims = 3
node_ids = np.array([], dtype=np.int64)
src_ids = np.array([], dtype=np.int64)
dst_ids = np.array([], dtype=np.int64)
ngraph_ids = np.array([], dtype=np.int64)
egraph_ids = np.array([], dtype=np.int64)
u_indices = np.array([], dtype=np.int64)
for i in range(num_graphs):
l_node_ids = np.random.choice(
np.arange(num_nodes*2), size=num_nodes, replace=False)
node_ids = np.append(node_ids, l_node_ids)
_, l_u_indices = np.unique(l_node_ids, return_index=True)
u_indices = np.append(u_indices, l_u_indices)
ngraph_ids = np.append(ngraph_ids, np.full(num_nodes, i))
src_ids = np.append(src_ids, np.random.choice(
l_node_ids, size=num_edges))
dst_ids = np.append(dst_ids, np.random.choice(
l_node_ids, size=num_edges))
egraph_ids = np.append(egraph_ids, np.full(num_edges, i))
ndata = {'feat': np.random.rand(num_nodes*num_graphs, num_dims),
'label': np.random.randint(2, size=num_nodes*num_graphs)}
node_data = csv_ds.NodeData(node_ids, ndata, graph_id=ngraph_ids)
edata = {'feat': np.random.rand(
num_edges*num_graphs, num_dims), 'label': np.random.randint(2, size=num_edges*num_graphs)}
edge_data = csv_ds.EdgeData(src_ids, dst_ids, edata, graph_id=egraph_ids)
gdata = {'feat': np.random.rand(num_graphs, num_dims),
'label': np.random.randint(2, size=num_graphs)}
graph_data = csv_ds.GraphData(np.arange(num_graphs), gdata)
graphs, data_dict = csv_ds.DGLGraphConstructor.construct_graphs(
node_data, edge_data, graph_data)
assert len(graphs) == num_graphs
assert len(data_dict) == len(gdata)
for k, v in data_dict.items():
assert F.array_equal(F.tensor(gdata[k]), v)
for i, g in enumerate(graphs):
assert g.is_homogeneous
assert g.num_nodes() == num_nodes
assert g.num_edges() == num_edges
def assert_data(lhs, rhs, size, node=False):
for key, value in lhs.items():
assert key in rhs
value = value[i*size:(i+1)*size]
if node:
indices = u_indices[i*size:(i+1)*size]
value = value[indices]
assert F.array_equal(F.tensor(value), rhs[key])
assert_data(ndata, g.ndata, num_nodes, node=True)
assert_data(edata, g.edata, num_edges)
# Graph IDs found in node/edge CSV but not in graph CSV
graph_data = csv_ds.GraphData(np.arange(num_graphs-2), {})
expect_except = False
try:
_, _ = csv_ds.DGLGraphConstructor.construct_graphs(
node_data, edge_data, graph_data)
except:
expect_except = True
assert expect_except
def _test_DefaultDataParser():
# common csv
with tempfile.TemporaryDirectory() as test_dir:
csv_path = os.path.join(test_dir, "nodes.csv")
num_nodes = 5
num_labels = 3
num_dims = 2
node_id = np.arange(num_nodes)
label = np.random.randint(num_labels, size=num_nodes)
feat = np.random.rand(num_nodes, num_dims)
df = pd.DataFrame({'node_id': node_id, 'label': label,
'feat': [line.tolist() for line in feat],
})
df.to_csv(csv_path, index=False)
dp = csv_ds.DefaultDataParser()
df = pd.read_csv(csv_path)
dt = dp(df)
assert np.array_equal(node_id, dt['node_id'])
assert np.array_equal(label, dt['label'])
assert np.array_equal(feat, dt['feat'])
# string consists of non-numeric values
with tempfile.TemporaryDirectory() as test_dir:
csv_path = os.path.join(test_dir, "nodes.csv")
df = pd.DataFrame({'label': ['a', 'b', 'c'],
})
df.to_csv(csv_path, index=False)
dp = csv_ds.DefaultDataParser()
df = pd.read_csv(csv_path)
expect_except = False
try:
dt = dp(df)
except:
expect_except = True
assert expect_except
# csv has index column which is ignored as it's unnamed
with tempfile.TemporaryDirectory() as test_dir:
csv_path = os.path.join(test_dir, "nodes.csv")
df = pd.DataFrame({'label': [1, 2, 3],
})
df.to_csv(csv_path)
dp = csv_ds.DefaultDataParser()
df = pd.read_csv(csv_path)
dt = dp(df)
assert len(dt) == 1
def _test_load_yaml_with_sanity_check():
with tempfile.TemporaryDirectory() as test_dir:
yaml_path = os.path.join(test_dir, 'meta.yaml')
# workable but meaningless usually
yaml_data = {'dataset_name': 'default',
'node_data': [], 'edge_data': []}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
assert meta.version == '1.0.0'
assert meta.dataset_name == 'default'
assert meta.separator == ','
assert len(meta.node_data) == 0
assert len(meta.edge_data) == 0
assert meta.graph_data is None
# minimum with required fields only
yaml_data = {'version': '1.0.0', 'dataset_name': 'default', 'node_data': [{'file_name': 'nodes.csv'}],
'edge_data': [{'file_name': 'edges.csv'}],
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
for ndata in meta.node_data:
assert ndata.file_name == 'nodes.csv'
assert ndata.ntype == '_V'
assert ndata.graph_id_field == 'graph_id'
assert ndata.node_id_field == 'node_id'
for edata in meta.edge_data:
assert edata.file_name == 'edges.csv'
assert edata.etype == ['_V', '_E', '_V']
assert edata.graph_id_field == 'graph_id'
assert edata.src_id_field == 'src_id'
assert edata.dst_id_field == 'dst_id'
# optional fields are specified
yaml_data = {'version': '1.0.0', 'dataset_name': 'default',
'separator': '|',
'node_data': [{'file_name': 'nodes.csv', 'ntype': 'user', 'graph_id_field': 'xxx', 'node_id_field': 'xxx'}],
'edge_data': [{'file_name': 'edges.csv', 'etype': ['user', 'follow', 'user'], 'graph_id_field':'xxx', 'src_id_field':'xxx', 'dst_id_field':'xxx'}],
'graph_data': {'file_name': 'graph.csv', 'graph_id_field': 'xxx'}
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
assert len(meta.node_data) == 1
ndata = meta.node_data[0]
assert ndata.ntype == 'user'
assert ndata.graph_id_field == 'xxx'
assert ndata.node_id_field == 'xxx'
assert len(meta.edge_data) == 1
edata = meta.edge_data[0]
assert edata.etype == ['user', 'follow', 'user']
assert edata.graph_id_field == 'xxx'
assert edata.src_id_field == 'xxx'
assert edata.dst_id_field == 'xxx'
assert meta.graph_data is not None
assert meta.graph_data.file_name == 'graph.csv'
assert meta.graph_data.graph_id_field == 'xxx'
# some required fields are missing
yaml_data = {'dataset_name': 'default',
'node_data': [], 'edge_data': []}
for field in yaml_data.keys():
ydata = {k: v for k, v in yaml_data.items()}
ydata.pop(field)
with open(yaml_path, 'w') as f:
yaml.dump(ydata, f, sort_keys=False)
expect_except = False
try:
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
except:
expect_except = True
assert expect_except
# inapplicable version
yaml_data = {'version': '0.0.0', 'dataset_name': 'default', 'node_data': [{'file_name': 'nodes_0.csv'}],
'edge_data': [{'file_name': 'edges_0.csv'}],
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
expect_except = False
try:
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
except DGLError:
expect_except = True
assert expect_except
# duplicate node types
yaml_data = {'version': '1.0.0', 'dataset_name': 'default', 'node_data': [{'file_name': 'nodes.csv'}, {'file_name': 'nodes.csv'}],
'edge_data': [{'file_name': 'edges.csv'}],
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
expect_except = False
try:
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
except DGLError:
expect_except = True
assert expect_except
# duplicate edge types
yaml_data = {'version': '1.0.0', 'dataset_name': 'default', 'node_data': [{'file_name': 'nodes.csv'}],
'edge_data': [{'file_name': 'edges.csv'}, {'file_name': 'edges.csv'}],
}
with open(yaml_path, 'w') as f:
yaml.dump(yaml_data, f, sort_keys=False)
expect_except = False
try:
meta = csv_ds.load_yaml_with_sanity_check(yaml_path)
except DGLError:
expect_except = True
assert expect_except
def _test_load_node_data_from_csv():
with tempfile.TemporaryDirectory() as test_dir:
num_nodes = 100
# minimum
df = pd.DataFrame({'node_id': np.arange(num_nodes)})
csv_path = os.path.join(test_dir, 'nodes.csv')
df.to_csv(csv_path, index=False)
meta_node = csv_ds.MetaNode(file_name=csv_path)
node_data = csv_ds.NodeData.load_from_csv(
meta_node, csv_ds.DefaultDataParser())
assert np.array_equal(df['node_id'], node_data.id)
assert len(node_data.data) == 0
# common case
df = pd.DataFrame({'node_id': np.arange(num_nodes),
'label': np.random.randint(3, size=num_nodes)})
csv_path = os.path.join(test_dir, 'nodes.csv')
df.to_csv(csv_path, index=False)
meta_node = csv_ds.MetaNode(file_name=csv_path)
node_data = csv_ds.NodeData.load_from_csv(
meta_node, csv_ds.DefaultDataParser())
assert np.array_equal(df['node_id'], node_data.id)
assert len(node_data.data) == 1
assert np.array_equal(df['label'], node_data.data['label'])
assert np.array_equal(np.full(num_nodes, 0), node_data.graph_id)
assert node_data.type == '_V'
# add more fields into nodes.csv
df = pd.DataFrame({'node_id': np.arange(num_nodes), 'label': np.random.randint(
3, size=num_nodes), 'graph_id': np.full(num_nodes, 1)})
csv_path = os.path.join(test_dir, 'nodes.csv')
df.to_csv(csv_path, index=False)
meta_node = csv_ds.MetaNode(file_name=csv_path)
node_data = csv_ds.NodeData.load_from_csv(
meta_node, csv_ds.DefaultDataParser())
assert np.array_equal(df['node_id'], node_data.id)
assert len(node_data.data) == 1
assert np.array_equal(df['label'], node_data.data['label'])
assert np.array_equal(df['graph_id'], node_data.graph_id)
assert node_data.type == '_V'
# required header is missing
df = pd.DataFrame({'label': np.random.randint(3, size=num_nodes)})
csv_path = os.path.join(test_dir, 'nodes.csv')
df.to_csv(csv_path, index=False)
meta_node = csv_ds.MetaNode(file_name=csv_path)
expect_except = False
try:
csv_ds.NodeData.load_from_csv(
meta_node, csv_ds.DefaultDataParser())
except:
expect_except = True
assert expect_except
def _test_load_edge_data_from_csv():
with tempfile.TemporaryDirectory() as test_dir:
num_nodes = 100
num_edges = 1000
# minimum
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
'dst_id': np.random.randint(num_nodes, size=num_edges),
})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
edge_data = csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
assert np.array_equal(df['src_id'], edge_data.src)
assert np.array_equal(df['dst_id'], edge_data.dst)
assert len(edge_data.data) == 0
# common case
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
'dst_id': np.random.randint(num_nodes, size=num_edges),
'label': np.random.randint(3, size=num_edges)})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
edge_data = csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
assert np.array_equal(df['src_id'], edge_data.src)
assert np.array_equal(df['dst_id'], edge_data.dst)
assert len(edge_data.data) == 1
assert np.array_equal(df['label'], edge_data.data['label'])
assert np.array_equal(np.full(num_edges, 0), edge_data.graph_id)
assert edge_data.type == ('_V', '_E', '_V')
# add more fields into edges.csv
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
'dst_id': np.random.randint(num_nodes, size=num_edges),
'graph_id': np.arange(num_edges),
'feat': np.random.randint(3, size=num_edges),
'label': np.random.randint(3, size=num_edges)})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
edge_data = csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
assert np.array_equal(df['src_id'], edge_data.src)
assert np.array_equal(df['dst_id'], edge_data.dst)
assert len(edge_data.data) == 2
assert np.array_equal(df['feat'], edge_data.data['feat'])
assert np.array_equal(df['label'], edge_data.data['label'])
assert np.array_equal(df['graph_id'], edge_data.graph_id)
assert edge_data.type == ('_V', '_E', '_V')
# required headers are missing
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
expect_except = False
try:
csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
except DGLError:
expect_except = True
assert expect_except
df = pd.DataFrame({'dst_id': np.random.randint(num_nodes, size=num_edges),
})
csv_path = os.path.join(test_dir, 'edges.csv')
df.to_csv(csv_path, index=False)
meta_edge = csv_ds.MetaEdge(file_name=csv_path)
expect_except = False
try:
csv_ds.EdgeData.load_from_csv(
meta_edge, csv_ds.DefaultDataParser())
except DGLError:
expect_except = True
assert expect_except
def _test_load_graph_data_from_csv():
with tempfile.TemporaryDirectory() as test_dir:
num_graphs = 100
# minimum
df = pd.DataFrame({'graph_id': np.arange(num_graphs)})
csv_path = os.path.join(test_dir, 'graph.csv')
df.to_csv(csv_path, index=False)
meta_graph = csv_ds.MetaGraph(file_name=csv_path)
graph_data = csv_ds.GraphData.load_from_csv(
meta_graph, csv_ds.DefaultDataParser())
assert np.array_equal(df['graph_id'], graph_data.graph_id)
assert len(graph_data.data) == 0
# common case
df = pd.DataFrame({'graph_id': np.arange(num_graphs),
'label': np.random.randint(3, size=num_graphs)})
csv_path = os.path.join(test_dir, 'graph.csv')
df.to_csv(csv_path, index=False)
meta_graph = csv_ds.MetaGraph(file_name=csv_path)
graph_data = csv_ds.GraphData.load_from_csv(
meta_graph, csv_ds.DefaultDataParser())
assert np.array_equal(df['graph_id'], graph_data.graph_id)
assert len(graph_data.data) == 1
assert np.array_equal(df['label'], graph_data.data['label'])
# add more fields into graph.csv
df = pd.DataFrame({'graph_id': np.arange(num_graphs),
'feat': np.random.randint(3, size=num_graphs),
'label': np.random.randint(3, size=num_graphs)})
csv_path = os.path.join(test_dir, 'graph.csv')
df.to_csv(csv_path, index=False)
meta_graph = csv_ds.MetaGraph(file_name=csv_path)
graph_data = csv_ds.GraphData.load_from_csv(
meta_graph, csv_ds.DefaultDataParser())
assert np.array_equal(df['graph_id'], graph_data.graph_id)
assert len(graph_data.data) == 2
assert np.array_equal(df['feat'], graph_data.data['feat'])
assert np.array_equal(df['label'], graph_data.data['label'])
# required header is missing
df = pd.DataFrame({'label': np.random.randint(3, size=num_graphs)})
csv_path = os.path.join(test_dir, 'graph.csv')
df.to_csv(csv_path, index=False)
meta_graph = csv_ds.MetaGraph(file_name=csv_path)
expect_except = False
try:
csv_ds.GraphData.load_from_csv(
meta_graph, csv_ds.DefaultDataParser())
except DGLError:
expect_except = True
assert expect_except
def _test_DGLCSVDataset_single():
with tempfile.TemporaryDirectory() as test_dir:
# generate YAML/CSVs
meta_yaml_path = os.path.join(test_dir, "meta.yaml")
edges_csv_path_0 = os.path.join(test_dir, "test_edges_0.csv")
edges_csv_path_1 = os.path.join(test_dir, "test_edges_1.csv")
nodes_csv_path_0 = os.path.join(test_dir, "test_nodes_0.csv")
nodes_csv_path_1 = os.path.join(test_dir, "test_nodes_1.csv")
meta_yaml_data = {'version': '1.0.0', 'dataset_name': 'default_name',
'node_data': [{'file_name': os.path.basename(nodes_csv_path_0),
'ntype': 'user',
},
{'file_name': os.path.basename(nodes_csv_path_1),
'ntype': 'item',
}],
'edge_data': [{'file_name': os.path.basename(edges_csv_path_0),
'etype': ['user', 'follow', 'user'],
},
{'file_name': os.path.basename(edges_csv_path_1),
'etype': ['user', 'like', 'item'],
}],
}
with open(meta_yaml_path, 'w') as f:
yaml.dump(meta_yaml_data, f, sort_keys=False)
num_nodes = 100
num_edges = 500
num_dims = 3
feat_ndata = np.random.rand(num_nodes, num_dims)
label_ndata = np.random.randint(2, size=num_nodes)
df = pd.DataFrame({'node_id': np.arange(num_nodes),
'label': label_ndata,
'feat': [line.tolist() for line in feat_ndata],
})
df.to_csv(nodes_csv_path_0, index=False)
df.to_csv(nodes_csv_path_1, index=False)
feat_edata = np.random.rand(num_edges, num_dims)
label_edata = np.random.randint(2, size=num_edges)
df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
'dst_id': np.random.randint(num_nodes, size=num_edges),
'label': label_edata,
'feat': [line.tolist() for line in feat_edata],
})
df.to_csv(edges_csv_path_0, index=False)
df.to_csv(edges_csv_path_1, index=False)
# load CSVDataset
for force_reload in [True, False]:
if not force_reload:
# remove original node data file to verify reload from cached files
os.remove(nodes_csv_path_0)
assert not os.path.exists(nodes_csv_path_0)
csv_dataset = data.DGLCSVDataset(
test_dir, force_reload=force_reload)
assert len(csv_dataset) == 1
g = csv_dataset[0]
assert not g.is_homogeneous
assert csv_dataset.has_cache()
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
assert F.array_equal(F.tensor(feat_ndata),
g.nodes[ntype].data['feat'])
assert np.array_equal(label_ndata,
F.asnumpy(g.nodes[ntype].data['label']))
for etype in g.etypes:
assert g.num_edges(etype) == num_edges
assert F.array_equal(F.tensor(feat_edata),
g.edges[etype].data['feat'])
assert np.array_equal(label_edata,
F.asnumpy(g.edges[etype].data['label']))
def _test_DGLCSVDataset_multiple():
with tempfile.TemporaryDirectory() as test_dir:
# generate YAML/CSVs
meta_yaml_path = os.path.join(test_dir, "meta.yaml")
edges_csv_path_0 = os.path.join(test_dir, "test_edges_0.csv")
edges_csv_path_1 = os.path.join(test_dir, "test_edges_1.csv")
nodes_csv_path_0 = os.path.join(test_dir, "test_nodes_0.csv")
nodes_csv_path_1 = os.path.join(test_dir, "test_nodes_1.csv")
graph_csv_path = os.path.join(test_dir, "test_graph.csv")
meta_yaml_data = {'version': '1.0.0', 'dataset_name': 'default_name',
'node_data': [{'file_name': os.path.basename(nodes_csv_path_0),
'ntype': 'user',
},
{'file_name': os.path.basename(nodes_csv_path_1),
'ntype': 'item',
}],
'edge_data': [{'file_name': os.path.basename(edges_csv_path_0),
'etype': ['user', 'follow', 'user'],
},
{'file_name': os.path.basename(edges_csv_path_1),
'etype': ['user', 'like', 'item'],
}],
'graph_data': {'file_name': os.path.basename(graph_csv_path)}
}
with open(meta_yaml_path, 'w') as f:
yaml.dump(meta_yaml_data, f, sort_keys=False)
num_nodes = 100
num_edges = 500
num_graphs = 10
num_dims = 3
feat_ndata = np.random.rand(num_nodes*num_graphs, num_dims)
label_ndata = np.random.randint(2, size=num_nodes*num_graphs)
df = pd.DataFrame({'node_id': np.hstack([np.arange(num_nodes) for _ in range(num_graphs)]),
'label': label_ndata,
'feat': [line.tolist() for line in feat_ndata],
'graph_id': np.hstack([np.full(num_nodes, i) for i in range(num_graphs)])
})
df.to_csv(nodes_csv_path_0, index=False)
df.to_csv(nodes_csv_path_1, index=False)
feat_edata = np.random.rand(num_edges*num_graphs, num_dims)
label_edata = np.random.randint(2, size=num_edges*num_graphs)
df = pd.DataFrame({'src_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]),
'dst_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]),
'label': label_edata,
'feat': [line.tolist() for line in feat_edata],
'graph_id': np.hstack([np.full(num_edges, i) for i in range(num_graphs)])
})
df.to_csv(edges_csv_path_0, index=False)
df.to_csv(edges_csv_path_1, index=False)
feat_gdata = np.random.rand(num_graphs, num_dims)
label_gdata = np.random.randint(2, size=num_graphs)
df = pd.DataFrame({'label': label_gdata,
'feat': [line.tolist() for line in feat_gdata],
'graph_id': np.arange(num_graphs)
})
df.to_csv(graph_csv_path, index=False)
# load CSVDataset with default node/edge/graph_data_parser
for force_reload in [True, False]:
if not force_reload:
# remove original node data file to verify reload from cached files
os.remove(nodes_csv_path_0)
assert not os.path.exists(nodes_csv_path_0)
csv_dataset = data.DGLCSVDataset(
test_dir, force_reload=force_reload)
assert len(csv_dataset) == num_graphs
assert csv_dataset.has_cache()
assert len(csv_dataset.data) == 2
assert 'feat' in csv_dataset.data
assert 'label' in csv_dataset.data
assert F.array_equal(F.tensor(feat_gdata),
csv_dataset.data['feat'])
for i, (g, label) in enumerate(csv_dataset):
assert not g.is_homogeneous
assert F.asnumpy(label) == label_gdata[i]
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
assert F.array_equal(F.tensor(feat_ndata[i*num_nodes:(i+1)*num_nodes]),
g.nodes[ntype].data['feat'])
assert np.array_equal(label_ndata[i*num_nodes:(i+1)*num_nodes],
F.asnumpy(g.nodes[ntype].data['label']))
for etype in g.etypes:
assert g.num_edges(etype) == num_edges
assert F.array_equal(F.tensor(feat_edata[i*num_edges:(i+1)*num_edges]),
g.edges[etype].data['feat'])
assert np.array_equal(label_edata[i*num_edges:(i+1)*num_edges],
F.asnumpy(g.edges[etype].data['label']))
def _test_DGLCSVDataset_customized_data_parser():
with tempfile.TemporaryDirectory() as test_dir:
# generate YAML/CSVs
meta_yaml_path = os.path.join(test_dir, "meta.yaml")
edges_csv_path_0 = os.path.join(test_dir, "test_edges_0.csv")
edges_csv_path_1 = os.path.join(test_dir, "test_edges_1.csv")
nodes_csv_path_0 = os.path.join(test_dir, "test_nodes_0.csv")
nodes_csv_path_1 = os.path.join(test_dir, "test_nodes_1.csv")
graph_csv_path = os.path.join(test_dir, "test_graph.csv")
meta_yaml_data = {'dataset_name': 'default_name',
'node_data': [{'file_name': os.path.basename(nodes_csv_path_0),
'ntype': 'user',
},
{'file_name': os.path.basename(nodes_csv_path_1),
'ntype': 'item',
}],
'edge_data': [{'file_name': os.path.basename(edges_csv_path_0),
'etype': ['user', 'follow', 'user'],
},
{'file_name': os.path.basename(edges_csv_path_1),
'etype': ['user', 'like', 'item'],
}],
'graph_data': {'file_name': os.path.basename(graph_csv_path)}
}
with open(meta_yaml_path, 'w') as f:
yaml.dump(meta_yaml_data, f, sort_keys=False)
num_nodes = 100
num_edges = 500
num_graphs = 10
label_ndata = np.random.randint(2, size=num_nodes*num_graphs)
df = pd.DataFrame({'node_id': np.hstack([np.arange(num_nodes) for _ in range(num_graphs)]),
'label': label_ndata,
'graph_id': np.hstack([np.full(num_nodes, i) for i in range(num_graphs)])
})
df.to_csv(nodes_csv_path_0, index=False)
df.to_csv(nodes_csv_path_1, index=False)
label_edata = np.random.randint(2, size=num_edges*num_graphs)
df = pd.DataFrame({'src_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]),
'dst_id': np.hstack([np.random.randint(num_nodes, size=num_edges) for _ in range(num_graphs)]),
'label': label_edata,
'graph_id': np.hstack([np.full(num_edges, i) for i in range(num_graphs)])
})
df.to_csv(edges_csv_path_0, index=False)
df.to_csv(edges_csv_path_1, index=False)
label_gdata = np.random.randint(2, size=num_graphs)
df = pd.DataFrame({'label': label_gdata,
'graph_id': np.arange(num_graphs)
})
df.to_csv(graph_csv_path, index=False)
class CustDataParser:
def __call__(self, df):
data = {}
for header in df:
dt = df[header].to_numpy().squeeze()
if header == 'label':
dt += 2
data[header] = dt
return data
# load CSVDataset with customized node/edge/graph_data_parser
csv_dataset = data.DGLCSVDataset(
test_dir, node_data_parser={'user': CustDataParser()}, edge_data_parser={('user', 'like', 'item'): CustDataParser()}, graph_data_parser=CustDataParser())
assert len(csv_dataset) == num_graphs
assert len(csv_dataset.data) == 1
assert 'label' in csv_dataset.data
for i, (g, label) in enumerate(csv_dataset):
assert not g.is_homogeneous
assert F.asnumpy(label) == label_gdata[i] + 2
for ntype in g.ntypes:
assert g.num_nodes(ntype) == num_nodes
offset = 2 if ntype == 'user' else 0
assert np.array_equal(label_ndata[i*num_nodes:(i+1)*num_nodes]+offset,
F.asnumpy(g.nodes[ntype].data['label']))
for etype in g.etypes:
assert g.num_edges(etype) == num_edges
offset = 2 if etype == 'like' else 0
assert np.array_equal(label_edata[i*num_edges:(i+1)*num_edges]+offset,
F.asnumpy(g.edges[etype].data['label']))
def _test_NodeEdgeGraphData():
# NodeData basics
num_nodes = 100
node_ids = np.arange(num_nodes, dtype=np.float64)
ndata = csv_ds.NodeData(node_ids, {})
assert ndata.id.dtype == np.int64
assert np.array_equal(ndata.id, node_ids.astype(np.int64))
assert len(ndata.data) == 0
assert ndata.type == '_V'
assert np.array_equal(ndata.graph_id, np.full(num_nodes, 0))
# NodeData more
data = {'feat': np.random.rand(num_nodes, 3)}
graph_id = np.arange(num_nodes)
ndata = csv_ds.NodeData(node_ids, data, type='user', graph_id=graph_id)
assert ndata.type == 'user'
assert np.array_equal(ndata.graph_id, graph_id)
assert len(ndata.data) == len(data)
for k, v in data.items():
assert k in ndata.data
assert np.array_equal(ndata.data[k], v)
# NodeData except
expect_except = False
try:
csv_ds.NodeData(np.arange(num_nodes), {'feat': np.random.rand(
num_nodes+1, 3)}, graph_id=np.arange(num_nodes-1))
except:
expect_except = True
assert expect_except
# EdgeData basics
num_nodes = 100
num_edges = 1000
src_ids = np.random.randint(num_nodes, size=num_edges)
dst_ids = np.random.randint(num_nodes, size=num_edges)
edata = csv_ds.EdgeData(src_ids, dst_ids, {})
assert np.array_equal(edata.src, src_ids)
assert np.array_equal(edata.dst, dst_ids)
assert edata.type == ('_V', '_E', '_V')
assert len(edata.data) == 0
assert np.array_equal(edata.graph_id, np.full(num_edges, 0))
# EdgeData more
src_ids = np.random.randint(num_nodes, size=num_edges).astype(np.float64)
dst_ids = np.random.randint(num_nodes, size=num_edges).astype(np.float64)
data = {'feat': np.random.rand(num_edges, 3)}
etype = ('user', 'like', 'item')
graph_ids = np.arange(num_edges)
edata = csv_ds.EdgeData(src_ids, dst_ids, data,
type=etype, graph_id=graph_ids)
assert edata.src.dtype == np.int64
assert edata.dst.dtype == np.int64
assert np.array_equal(edata.src, src_ids)
assert np.array_equal(edata.dst, dst_ids)
assert edata.type == etype
assert len(edata.data) == len(data)
for k, v in data.items():
assert k in edata.data
assert np.array_equal(edata.data[k], v)
assert np.array_equal(edata.graph_id, graph_ids)
# EdgeData except
expect_except = False
try:
csv_ds.EdgeData(np.arange(num_edges), np.arange(
num_edges+1), {'feat': np.random.rand(num_edges-1, 3)}, graph_id=np.arange(num_edges+2))
except:
expect_except = True
assert expect_except
# GraphData basics
num_graphs = 10
graph_ids = np.arange(num_graphs)
gdata = csv_ds.GraphData(graph_ids, {})
assert np.array_equal(gdata.graph_id, graph_ids)
assert len(gdata.data) == 0
# GraphData more
graph_ids = np.arange(num_graphs).astype(np.float64)
data = {'feat': np.random.rand(num_graphs, 3)}
gdata = csv_ds.GraphData(graph_ids, data)
assert gdata.graph_id.dtype == np.int64
assert np.array_equal(gdata.graph_id, graph_ids)
assert len(gdata.data) == len(data)
for k, v in data.items():
assert k in gdata.data
assert np.array_equal(gdata.data[k], v)
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_csvdataset():
_test_NodeEdgeGraphData()
_test_construct_graphs_homo()
_test_construct_graphs_hetero()
_test_construct_graphs_multiple()
_test_DefaultDataParser()
_test_load_yaml_with_sanity_check()
_test_load_node_data_from_csv()
_test_load_edge_data_from_csv()
_test_load_graph_data_from_csv()
_test_DGLCSVDataset_single()
_test_DGLCSVDataset_multiple()
_test_DGLCSVDataset_customized_data_parser()
if __name__ == '__main__':
test_minigc()
test_gin()
......@@ -164,3 +1014,4 @@ if __name__ == '__main__':
test_fraud()
test_fakenews()
test_extract_archive()
test_csvdataset()
......@@ -14,7 +14,7 @@ SET DGLBACKEND=!BACKEND!
SET DGL_LIBRARY_PATH=!CD!\build
SET DGL_DOWNLOAD_DIR=!CD!
python -m pip install pytest || EXIT /B 1
python -m pip install pytest pyyaml pandas pydantic || EXIT /B 1
python -m pytest -v --junitxml=pytest_backend.xml tests\!DGLBACKEND! || EXIT /B 1
python -m pytest -v --junitxml=pytest_compute.xml tests\compute || EXIT /B 1
ENDLOCAL
......
......@@ -32,6 +32,7 @@ fi
conda activate ${DGLBACKEND}-ci
python3 -m pip install pytest pyyaml pandas pydantic || fail "pip install"
python3 -m pytest -v --junitxml=pytest_compute.xml tests/compute || fail "compute"
python3 -m pytest -v --junitxml=pytest_backend.xml tests/$DGLBACKEND || fail "backend-specific"
......