gb_test_utils.py

import os

import dgl.graphbolt as gb

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch


def minibatch_node_collator(data):
    minibatch = gb.MiniBatch(seed_node=data)
    return minibatch


def minibatch_link_collator(data):
    minibatch = gb.MiniBatch(node_pair=data)
    return minibatch


def rand_csc_graph(N, density):
    adj = sp.random(N, N, density)
    adj = adj + adj.T
    adj = adj.tocsc()

    indptr = torch.LongTensor(adj.indptr)
    indices = torch.LongTensor(adj.indices)

    graph = gb.from_csc(indptr, indices)

    return graph


def random_homo_graph(num_nodes, num_edges):
    csc_indptr = torch.randint(0, num_edges, (num_nodes + 1,))
    csc_indptr = torch.sort(csc_indptr)[0]
    csc_indptr[0] = 0
    csc_indptr[-1] = num_edges
    indices = torch.randint(0, num_nodes, (num_edges,))
    return csc_indptr, indices


def get_metadata(num_ntypes, num_etypes):
    ntypes = {f"n{i}": i for i in range(num_ntypes)}
    etypes = {}
    count = 0
    for n1 in range(num_ntypes):
        for n2 in range(n1, num_ntypes):
            if count >= num_etypes:
                break
            etypes.update({f"n{n1}:e{count}:n{n2}": count})
            count += 1
    return gb.GraphMetadata(ntypes, etypes)


def random_hetero_graph(num_nodes, num_edges, num_ntypes, num_etypes):
    csc_indptr, indices = random_homo_graph(num_nodes, num_edges)
    metadata = get_metadata(num_ntypes, num_etypes)
    # Randomly get node type split point.
    node_type_offset = torch.sort(
        torch.randint(0, num_nodes, (num_ntypes + 1,))
    )[0]
    node_type_offset[0] = 0
    node_type_offset[-1] = num_nodes

    type_per_edge = []
    for i in range(num_nodes):
        num = csc_indptr[i + 1] - csc_indptr[i]
        type_per_edge.append(
            torch.sort(torch.randint(0, num_etypes, (num,)))[0]
        )
    type_per_edge = torch.cat(type_per_edge, dim=0)
    return (csc_indptr, indices, node_type_offset, type_per_edge, metadata)


def random_homo_graphbolt_graph(
    test_dir, dataset_name, num_nodes, num_edges, num_classes
):
    """Generate random graphbolt version homograph"""
    # Generate random edges.
    nodes = np.repeat(np.arange(num_nodes), 5)
    neighbors = np.random.randint(0, num_nodes, size=(num_edges))
    edges = np.stack([nodes, neighbors], axis=1)
    # Wrtie into edges/edge.csv
    os.makedirs(os.path.join(test_dir, "edges"), exist_ok=True)
    edges = pd.DataFrame(edges, columns=["src", "dst"])
    edge_path = os.path.join("edges", "edge.csv")
    edges.to_csv(
        os.path.join(test_dir, edge_path),
        index=False,
        header=False,
    )

    # Generate random graph edge-feats.
    edge_feats = np.random.rand(num_edges, 5)
    os.makedirs(os.path.join(test_dir, "data"), exist_ok=True)
    edge_feat_path = os.path.join("data", "edge-feat.npy")
    np.save(os.path.join(test_dir, edge_feat_path), edge_feats)

    # Generate random node-feats.
    node_feats = np.random.rand(num_nodes, num_classes)
    node_feat_path = os.path.join("data", "node-feat.npy")
    np.save(os.path.join(test_dir, node_feat_path), node_feats)

    # Generate train/test/valid set.
    assert num_nodes % 4 == 0, "num_nodes must be divisible by 4"
    each_set_size = num_nodes // 4
    os.makedirs(os.path.join(test_dir, "set"), exist_ok=True)
    train_pairs = (
        np.arange(each_set_size),
        np.arange(each_set_size, 2 * each_set_size),
    )
    train_labels = np.random.randint(0, num_classes, size=each_set_size)
    train_data = np.vstack([train_pairs, train_labels]).T
    train_path = os.path.join("set", "train.npy")
    np.save(os.path.join(test_dir, train_path), train_data)

    validation_pairs = (
        np.arange(each_set_size, 2 * each_set_size),
        np.arange(2 * each_set_size, 3 * each_set_size),
    )
    validation_labels = np.random.randint(0, num_classes, size=each_set_size)
    validation_data = np.vstack([validation_pairs, validation_labels]).T
    validation_path = os.path.join("set", "validation.npy")
    np.save(os.path.join(test_dir, validation_path), validation_data)

    test_pairs = (
        np.arange(2 * each_set_size, 3 * each_set_size),
        np.arange(3 * each_set_size, 4 * each_set_size),
    )
    test_labels = np.random.randint(0, num_classes, size=each_set_size)
    test_data = np.vstack([test_pairs, test_labels]).T
    test_path = os.path.join("set", "test.npy")
    np.save(os.path.join(test_dir, test_path), test_data)

    yaml_content = f"""
        dataset_name: {dataset_name}
        graph: # graph structure and required attributes.
            nodes:
                - num: {num_nodes}
            edges:
                - format: csv
                  path: {edge_path}
            feature_data:
                - domain: edge
                  type: null
                  name: feat
                  format: numpy
                  in_memory: true
                  path: {edge_feat_path}
        feature_data:
            - domain: node
              type: null
              name: feat
              format: numpy
              in_memory: false
              path: {node_feat_path}
        tasks:
          - name: node_classification
            num_classes: {num_classes}
            train_set:
              - type_name: null
                data:
                  - format: numpy
                    path: {train_path}
            validation_set:
              - type_name: null
                data:
                  - format: numpy
                    path: {validation_path}
            test_set:
              - type_name: null
                data:
                  - format: numpy
                    path: {test_path}
    """
    return yaml_content