Unverified Commit ffe2871b authored by OlegPlatonov's avatar OlegPlatonov Committed by GitHub

[Feature] Adjusted homophily and label informativeness (#6516)


Co-authored-by: rudongyu <ru_dongyu@outlook.com>
parent d873acc2
......@@ -215,6 +215,18 @@ Utilities for measuring homophily of a graph
edge_homophily
node_homophily
linkx_homophily
adjusted_homophily
Label Informativeness Measures
------------------------------
Utilities for measuring label informativeness of a graph
.. autosummary::
:toctree: ../../generated/
edge_label_informativeness
node_label_informativeness
Utilities
-----------------------------------------------
......
......@@ -48,7 +48,6 @@ from .heterograph import ( # pylint: disable=reimported
DGLGraph,
DGLGraph as DGLHeteroGraph,
)
from .homophily import *
from .merge import *
from .subgraph import *
from .traversal import *
......@@ -61,6 +60,8 @@ from .frame import LazyFeature
from .global_config import is_libxsmm_enabled, use_libxsmm
from .utils import apply_each
from .mpops import *
from .homophily import *
from .label_informativeness import *
if backend_name == "pytorch":
from . import distributed
"""Utils for tacking graph homophily and heterophily"""
"""Utils for tracking graph homophily and heterophily"""
# pylint: disable=W0611
from . import function as fn
from . import function as fn, to_bidirected
try:
import torch
......@@ -9,7 +9,12 @@ except ImportError:
else:
HAS_TORCH = True
__all__ = ["node_homophily", "edge_homophily", "linkx_homophily"]
__all__ = [
"node_homophily",
"edge_homophily",
"linkx_homophily",
"adjusted_homophily",
]
def check_pytorch():
......@@ -187,3 +192,78 @@ def linkx_homophily(graph, y):
value += max(0, same_class_deg_k / deg_k - num_nodes_k / num_nodes)
return value.item() / (num_classes - 1)
def adjusted_homophily(graph, y):
r"""Homophily measure recommended in `Characterizing Graph Datasets for
Node Classification: Homophily-Heterophily Dichotomy and Beyond
<https://arxiv.org/abs/2209.06177>`__
Adjusted homophily is edge homophily adjusted for the expected number of
edges connecting nodes with the same class label (taking into account the
number of classes, their sizes, and the distribution of node degrees among
them).
Mathematically it is defined as follows:
.. math::
\frac{h_{edge} - \sum_{k=1}^C \bar{p}(k)^2}
{1 - \sum_{k=1}^C \bar{p}(k)^2},
where :math:`h_{edge}` denotes edge homophily, :math:`C` denotes the
number of classes, and :math:`\bar{p}(\cdot)` is the empirical
degree-weighted distribution of classes:
:math:`\bar{p}(k) = \frac{\sum_{v\,:\,y_v = k} d(v)}{2|E|}`,
where :math:`d(v)` is the degree of node :math:`v`.
It has been shown that adjusted homophily satisfies more desirable
properties than other homophily measures, which makes it appropriate for
comparing the levels of homophily across datasets with different numbers
of classes, different class sizes, and different degree distributions
among classes.
Adjusted homophily can be negative. If adjusted homophily is zero, then
the edge pattern in the graph is independent of node class labels. If it
is positive, then the nodes in the graph tend to connect to nodes of the
same class more often, and if it is negative, then the nodes in the graph
tend to connect to nodes of different classes more often (compared to the
null model where edges are independent of node class labels).
Parameters
----------
graph : DGLGraph
The graph.
y : torch.Tensor
The node labels, which is a tensor of shape (|V|).
Returns
-------
float
The adjusted homophily value.
Examples
--------
>>> import dgl
>>> import torch
>>> graph = dgl.graph(([1, 2, 0, 4], [0, 1, 2, 3]))
>>> y = torch.tensor([0, 0, 0, 0, 1])
>>> dgl.adjusted_homophily(graph, y)
-0.1428571492433548
"""
check_pytorch()
graph = to_bidirected(graph.cpu()).to(y.device)
h_edge = edge_homophily(graph, y)
degrees = graph.in_degrees().float()
num_classes = y.max().item() + 1
degree_sums = torch.zeros(num_classes).to(y.device)
degree_sums.index_add_(dim=0, index=y, source=degrees)
adjust = (degree_sums**2).sum() / graph.num_edges() ** 2
h_adj = (h_edge - adjust) / (1 - adjust)
return h_adj.item()
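To make the adjusted-homophily formula concrete, the following minimal sketch (not part of the patch) reproduces the docstring example with plain PyTorch; the edge list is the hand-symmetrized version of ([1, 2, 0, 4], [0, 1, 2, 3]) and all tensor names are illustrative.
import torch
# Bidirected version of the example graph ([1, 2, 0, 4], [0, 1, 2, 3]).
src = torch.tensor([1, 0, 2, 1, 0, 2, 4, 3])
dst = torch.tensor([0, 1, 1, 2, 2, 0, 3, 4])
y = torch.tensor([0, 0, 0, 0, 1])
# Edge homophily: fraction of edges whose endpoints share a label (6/8 = 0.75).
h_edge = (y[src] == y[dst]).float().mean()
# Degree-weighted class mass sum_{v: y_v = k} d(v), then sum_k p_bar(k)^2.
num_classes = int(y.max()) + 1
deg = torch.bincount(dst, minlength=y.numel()).float()
deg_per_class = torch.zeros(num_classes).index_add_(0, y, deg)
adjust = (deg_per_class ** 2).sum() / src.numel() ** 2  # (7**2 + 1**2) / 8**2
h_adj = (h_edge - adjust) / (1 - adjust)
print(h_adj.item())  # ~ -0.142857, matching the docstring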
"""Utils for computing graph label informativeness"""
from . import to_bidirected
try:
import torch
except ImportError:
HAS_TORCH = False
else:
HAS_TORCH = True
__all__ = ["edge_label_informativeness", "node_label_informativeness"]
def check_pytorch():
"""Check if PyTorch is the backend."""
if HAS_TORCH is False:
raise ModuleNotFoundError(
"This function requires PyTorch to be the backend."
)
def edge_label_informativeness(graph, y, eps=1e-8):
r"""Label informativeness (:math:`\mathrm{LI}`) is a characteristic of
labeled graphs proposed in the `Characterizing Graph Datasets for Node
Classification: Homophily-Heterophily Dichotomy and Beyond
<https://arxiv.org/abs/2209.06177>`__
Label informativeness shows how much information about a node's label we
get from knowing its neighbor's label. Formally, assume that we sample an
edge :math:`(\xi,\eta) \in E`. The class labels of nodes :math:`\xi` and
:math:`\eta` are then random variables :math:`y_\xi` and :math:`y_\eta`.
We want to measure the amount of knowledge the label :math:`y_\eta` gives
for predicting :math:`y_\xi`. The entropy :math:`H(y_\xi)` measures the
'hardness' of predicting the label of :math:`\xi` without knowing
:math:`y_\eta`. Given :math:`y_\eta`, this value is reduced to the
conditional entropy :math:`H(y_\xi|y_\eta)`. In other words, :math:`y_\eta`
reveals :math:`I(y_\xi,y_\eta) = H(y_\xi) - H(y_\xi|y_\eta)` information
about the label. To make the obtained quantity comparable across different
datasets, label informativeness is defined as the normalized mutual
information of :math:`y_{\xi}` and :math:`y_{\eta}`:
.. math::
\mathrm{LI} = \frac{I(y_\xi,y_\eta)}{H(y_\xi)}
Depending on the distribution used for sampling an edge
:math:`(\xi, \eta)`, several variants of label informativeness can be
obtained. Two of them are particularly intuitive: in edge label
informativeness (:math:`\mathrm{LI}_{edge}`), edges are sampled uniformly
at random, and in node label informativeness (:math:`\mathrm{LI}_{node}`),
first a node is sampled uniformly at random and then an edge incident to it
is sampled uniformly at random. These two versions of label informativeness
differ in how they weight high/low-degree nodes. In edge label
informativeness, averaging is over the edges, thus high-degree nodes are
given more weight. In node label informativeness, averaging is over the
nodes, so all nodes are weighted equally.
This function computes edge label informativeness.
Parameters
----------
graph : DGLGraph
The graph.
y : torch.Tensor
The node labels, which is a tensor of shape (|V|).
eps : float, optional
A small constant for numerical stability. (default: 1e-8)
Returns
-------
float
The edge label informativeness value.
Examples
--------
>>> import dgl
>>> import torch
>>> graph = dgl.graph(([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]))
>>> y = torch.tensor([0, 0, 0, 0, 1, 1])
>>> dgl.edge_label_informativeness(graph, y)
0.25177597999572754
"""
check_pytorch()
graph = to_bidirected(graph.cpu()).to(y.device)
degrees = graph.in_degrees().float()
num_classes = y.max() + 1
class_degree_weighted_probs = torch.zeros(num_classes).to(y.device)
class_degree_weighted_probs.index_add_(dim=0, index=y, source=degrees)
class_degree_weighted_probs /= class_degree_weighted_probs.sum()
edge_probs = torch.zeros(num_classes, num_classes).to(y.device)
labels_u = y[graph.edges()[0].long()]
labels_v = y[graph.edges()[1].long()]
edge_probs.index_put_(
indices=(labels_u, labels_v),
values=torch.ones(graph.num_edges()).to(y.device),
accumulate=True,
)
edge_probs /= edge_probs.sum()
edge_probs += eps
numerator = (edge_probs * torch.log(edge_probs)).sum()
denominator = (
class_degree_weighted_probs * torch.log(class_degree_weighted_probs)
).sum()
li_edge = 2 - numerator / denominator
return li_edge.item()
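As a sanity check on the value above, here is a minimal sketch (not part of the patch) that recomputes the docstring example from the joint label distribution of edge endpoints, using :math:`\mathrm{LI}_{edge} = 2 - H(y_u, y_v) / H(\bar{p})`; the symmetrized edge list and names are illustrative.
import torch
# Bidirected version of the example graph ([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]).
src = torch.tensor([0, 1, 1, 2, 2, 0, 2, 3, 3, 4, 4, 5])
dst = torch.tensor([1, 0, 2, 1, 0, 2, 3, 2, 4, 3, 5, 4])
y = torch.tensor([0, 0, 0, 0, 1, 1])
num_classes = int(y.max()) + 1
# Joint distribution p(c1, c2) of endpoint labels over directed edges.
joint = torch.zeros(num_classes, num_classes)
joint.index_put_((y[src], y[dst]), torch.ones(src.numel()), accumulate=True)
joint /= joint.sum()
# Its marginal equals the degree-weighted class distribution p_bar.
marginal = joint.sum(dim=1)
h_joint = -(joint[joint > 0] * joint[joint > 0].log()).sum()
h_marginal = -(marginal * marginal.log()).sum()
print((2 - h_joint / h_marginal).item())  # ~ 0.251776, matching the docstring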
def node_label_informativeness(graph, y, eps=1e-8):
r"""Label informativeness (:math:`\mathrm{LI}`) is a characteristic of
labeled graphs proposed in the `Characterizing Graph Datasets for Node
Classification: Homophily-Heterophily Dichotomy and Beyond
<https://arxiv.org/abs/2209.06177>`__
Label informativeness shows how much information about a node's label we
get from knowing its neighbor's label. Formally, assume that we sample an
edge :math:`(\xi,\eta) \in E`. The class labels of nodes :math:`\xi` and
:math:`\eta` are then random variables :math:`y_\xi` and :math:`y_\eta`.
We want to measure the amount of knowledge the label :math:`y_\eta` gives
for predicting :math:`y_\xi`. The entropy :math:`H(y_\xi)` measures the
'hardness' of predicting the label of :math:`\xi` without knowing
:math:`y_\eta`. Given :math:`y_\eta`, this value is reduced to the
conditional entropy :math:`H(y_\xi|y_\eta)`. In other words, :math:`y_\eta`
reveals :math:`I(y_\xi,y_\eta) = H(y_\xi) - H(y_\xi|y_\eta)` information
about the label. To make the obtained quantity comparable across different
datasets, label informativeness is defined as the normalized mutual
information of :math:`y_{\xi}` and :math:`y_{\eta}`:
.. math::
\mathrm{LI} = \frac{I(y_\xi,y_\eta)}{H(y_\xi)}
Depending on the distribution used for sampling an edge
:math:`(\xi, \eta)`, several variants of label informativeness can be
obtained. Two of them are particularly intuitive: in edge label
informativeness (:math:`\mathrm{LI}_{edge}`), edges are sampled uniformly
at random, and in node label informativeness (:math:`\mathrm{LI}_{node}`),
first a node is sampled uniformly at random and then an edge incident to it
is sampled uniformly at random. These two versions of label informativeness
differ in how they weight high/low-degree nodes. In edge label
informativeness, averaging is over the edges, thus high-degree nodes are
given more weight. In node label informativeness, averaging is over the
nodes, so all nodes are weighted equally.
This function computes node label informativeness.
Parameters
----------
graph : DGLGraph
The graph.
y : torch.Tensor
The node labels, which is a tensor of shape (|V|).
eps : float, optional
A small constant for numerical stability. (default: 1e-8)
Returns
-------
float
The node label informativeness value.
Examples
--------
>>> import dgl
>>> import torch
>>> graph = dgl.graph(([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]))
>>> y = torch.tensor([0, 0, 0, 0, 1, 1])
>>> dgl.node_label_informativeness(graph, y)
0.3381872773170471
"""
check_pytorch()
graph = to_bidirected(graph.cpu()).to(y.device)
degrees = graph.in_degrees().float()
num_classes = y.max() + 1
class_probs = torch.zeros(num_classes).to(y.device)
class_probs.index_add_(
dim=0, index=y, source=torch.ones(graph.num_nodes()).to(y.device)
)
class_probs /= class_probs.sum()
class_degree_weighted_probs = torch.zeros(num_classes).to(y.device)
class_degree_weighted_probs.index_add_(dim=0, index=y, source=degrees)
class_degree_weighted_probs /= class_degree_weighted_probs.sum()
num_nonzero_degree_nodes = (degrees > 0).sum()
edge_probs = torch.zeros(num_classes, num_classes).to(y.device)
labels_u = y[graph.edges()[0].long()]
labels_v = y[graph.edges()[1].long()]
degrees_u = degrees[graph.edges()[0].long()]
edge_probs.index_put_(
indices=(labels_u, labels_v),
values=1 / (num_nonzero_degree_nodes * degrees_u),
accumulate=True,
)
edge_probs += eps
log = torch.log(
edge_probs
/ (class_probs[:, None] * class_degree_weighted_probs[None, :])
)
numerator = (edge_probs * log).sum()
denominator = (class_probs * torch.log(class_probs)).sum()
li_node = -numerator / denominator
return li_node.item()
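And a matching sketch (not part of the patch) for the docstring example of :math:`\mathrm{LI}_{node}`: the edge distribution now weights each directed edge (u, v) by 1 / (n * d(u)), i.e. first a node with nonzero degree and then one of its neighbors is sampled uniformly; names are illustrative.
import torch
# Same bidirected example graph as for edge label informativeness.
src = torch.tensor([0, 1, 1, 2, 2, 0, 2, 3, 3, 4, 4, 5])
dst = torch.tensor([1, 0, 2, 1, 0, 2, 3, 2, 4, 3, 5, 4])
y = torch.tensor([0, 0, 0, 0, 1, 1])
num_classes = int(y.max()) + 1
deg = torch.bincount(src, minlength=y.numel()).float()
# Uniform class distribution over nodes and degree-weighted class distribution.
p_class = torch.bincount(y, minlength=num_classes).float() / y.numel()
p_deg = torch.zeros(num_classes).index_add_(0, y, deg) / deg.sum()
# Edge (u, v) sampled by picking u uniformly, then a neighbor v uniformly.
n_pos = (deg > 0).sum()
joint = torch.zeros(num_classes, num_classes)
joint.index_put_((y[src], y[dst]), 1 / (n_pos * deg[src]), accumulate=True)
mask = joint > 0  # guard against log(0) for class pairs with no edges
expected = p_class[:, None] * p_deg[None, :]
numerator = (joint[mask] * (joint[mask] / expected[mask]).log()).sum()
denominator = (p_class * p_class.log()).sum()
print((-numerator / denominator).item())  # ~ 0.338187, matching the docstring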
......@@ -51,3 +51,18 @@ def test_linkx_homophily(idtype):
y = F.tensor([0, 1, 2, 3, 4])
assert math.isclose(dgl.linkx_homophily(graph, y), 0.0000000000000000)
@unittest.skipIf(
dgl.backend.backend_name != "pytorch", reason="Only support PyTorch for now"
)
@parametrize_idtype
def test_adjusted_homophily(idtype):
# IfChangeThenChange: python/dgl/homophily.py
# Update the docstring example.
device = F.ctx()
graph = dgl.graph(
([1, 2, 0, 4], [0, 1, 2, 3]), idtype=idtype, device=device
)
y = F.tensor([0, 0, 0, 0, 1])
assert math.isclose(dgl.adjusted_homophily(graph, y), -0.1428571492433548)
import math
import unittest
import backend as F
import dgl
from utils import parametrize_idtype
@unittest.skipIf(
dgl.backend.backend_name != "pytorch", reason="Only support PyTorch for now"
)
@parametrize_idtype
def test_edge_label_informativeness(idtype):
# IfChangeThenChange: python/dgl/label_informativeness.py
# Update the docstring example.
device = F.ctx()
graph = dgl.graph(
([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]), idtype=idtype, device=device
)
y = F.tensor([0, 0, 0, 0, 1, 1])
assert math.isclose(
dgl.edge_label_informativeness(graph, y),
0.25177597999572754,
abs_tol=1e-6,
)
@unittest.skipIf(
dgl.backend.backend_name != "pytorch", reason="Only support PyTorch for now"
)
@parametrize_idtype
def test_node_label_informativeness(idtype):
# IfChangeThenChange: python/dgl/label_informativeness.py
# Update the docstring example.
device = F.ctx()
graph = dgl.graph(
([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]), idtype=idtype, device=device
)
y = F.tensor([0, 0, 0, 0, 1, 1])
assert math.isclose(
dgl.node_label_informativeness(graph, y),
0.3381872773170471,
abs_tol=1e-6,
)