OpenDAS / dgl · Commit ffe2871b (unverified)
Authored Dec 11, 2023 by OlegPlatonov; committed by GitHub on Dec 12, 2023

[Feature] Adjusted homophily and label informativeness (#6516)

Co-authored-by: rudongyu <ru_dongyu@outlook.com>
Parent: d873acc2
Showing 6 changed files with 369 additions and 4 deletions (+369 −4)
docs/source/api/python/dgl.rst                     (+12 −0)
python/dgl/__init__.py                             (+2 −1)
python/dgl/homophily.py                            (+83 −3)
python/dgl/label_informativeness.py                (+212 −0)
tests/python/common/test_homophily.py              (+15 −0)
tests/python/common/test_label_informativeness.py  (+45 −0)
docs/source/api/python/dgl.rst

@@ -215,6 +215,18 @@ Utilities for measuring homophily of a graph
     edge_homophily
     node_homophily
     linkx_homophily
+    adjusted_homophily
+
+Label Informativeness Measures
+------------------------------
+
+Utilities for measuring label informativeness of a graph
+
+.. autosummary::
+    :toctree: ../../generated/
+
+    edge_label_informativeness
+    node_label_informativeness
 
 Utilities
 -----------------------------------------------
python/dgl/__init__.py

@@ -48,7 +48,6 @@ from .heterograph import ( # pylint: disable=reimported
     DGLGraph,
     DGLGraph as DGLHeteroGraph,
 )
-from .homophily import *
 from .merge import *
 from .subgraph import *
 from .traversal import *
@@ -61,6 +60,8 @@ from .frame import LazyFeature
 from .global_config import is_libxsmm_enabled, use_libxsmm
 from .utils import apply_each
 from .mpops import *
+from .homophily import *
+from .label_informativeness import *
 
 if backend_name == "pytorch":
     from . import distributed
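Since both modules are pulled in with wildcard imports and define `__all__`, the new metrics become available at the package top level. A minimal usage sketch (the graph, labels, and expected values are taken from the docstring examples in the files below):

import dgl
import torch

# Toy graph and labels from the label-informativeness docstring examples.
graph = dgl.graph(([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]))
y = torch.tensor([0, 0, 0, 0, 1, 1])

print(dgl.edge_label_informativeness(graph, y))  # ~0.2518
print(dgl.node_label_informativeness(graph, y))  # ~0.3382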
python/dgl/homophily.py

-"""Utils for tacking graph homophily and heterophily"""
+"""Utils for tracking graph homophily and heterophily"""
 # pylint: disable=W0611
-from . import function as fn
+from . import function as fn, to_bidirected
 
 try:
     import torch
 ...
@@ -9,7 +9,12 @@ except ImportError:
 else:
     HAS_TORCH = True
 
-__all__ = ["node_homophily", "edge_homophily", "linkx_homophily"]
+__all__ = [
+    "node_homophily",
+    "edge_homophily",
+    "linkx_homophily",
+    "adjusted_homophily",
+]
 
 
 def check_pytorch():
 ...
@@ -187,3 +192,78 @@ def linkx_homophily(graph, y):
         value += max(0, same_class_deg_k / deg_k - num_nodes_k / num_nodes)
 
     return value.item() / (num_classes - 1)
+
+
+def adjusted_homophily(graph, y):
+    r"""Homophily measure recommended in `Characterizing Graph Datasets for
+    Node Classification: Homophily-Heterophily Dichotomy and Beyond
+    <https://arxiv.org/abs/2209.06177>`__
+
+    Adjusted homophily is edge homophily adjusted for the expected number of
+    edges connecting nodes with the same class label (taking into account the
+    number of classes, their sizes, and the distribution of node degrees
+    among them).
+
+    Mathematically, it is defined as follows:
+
+    .. math::
+
+        \frac{h_{edge} - \sum_{k=1}^C \bar{p}(k)^2}
+        {1 - \sum_{k=1}^C \bar{p}(k)^2},
+
+    where :math:`h_{edge}` denotes edge homophily, :math:`C` denotes the
+    number of classes, and :math:`\bar{p}(\cdot)` is the empirical
+    degree-weighted distribution of classes:
+    :math:`\bar{p}(k) = \frac{\sum_{v\,:\,y_v = k} d(v)}{2|E|}`,
+    where :math:`d(v)` is the degree of node :math:`v`.
+
+    It has been shown that adjusted homophily satisfies more desirable
+    properties than other homophily measures, which makes it appropriate for
+    comparing the levels of homophily across datasets with different numbers
+    of classes, different class sizes, and different degree distributions
+    among classes.
+
+    Adjusted homophily can be negative. If adjusted homophily is zero, then
+    the edge pattern in the graph is independent of node class labels. If it
+    is positive, then the nodes in the graph tend to connect to nodes of the
+    same class more often, and if it is negative, then the nodes in the graph
+    tend to connect to nodes of different classes more often (compared to the
+    null model where edges are independent of node class labels).
+
+    Parameters
+    ----------
+    graph : DGLGraph
+        The graph.
+    y : torch.Tensor
+        The node labels, which is a tensor of shape (|V|).
+
+    Returns
+    -------
+    float
+        The adjusted homophily value.
+
+    Examples
+    --------
+    >>> import dgl
+    >>> import torch
+
+    >>> graph = dgl.graph(([1, 2, 0, 4], [0, 1, 2, 3]))
+    >>> y = torch.tensor([0, 0, 0, 0, 1])
+    >>> dgl.adjusted_homophily(graph, y)
+    -0.1428571492433548
+    """
+    check_pytorch()
+
+    graph = to_bidirected(graph.cpu()).to(y.device)
+
+    h_edge = edge_homophily(graph, y)
+
+    degrees = graph.in_degrees().float()
+    num_classes = y.max().item() + 1
+
+    degree_sums = torch.zeros(num_classes).to(y.device)
+    degree_sums.index_add_(dim=0, index=y, source=degrees)
+
+    adjust = (degree_sums**2).sum() / graph.num_edges() ** 2
+
+    h_adj = (h_edge - adjust) / (1 - adjust)
+
+    return h_adj.item()
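As a sanity check, the value in the docstring example above can be reproduced by hand from the formula. A minimal sketch in plain Python (no DGL needed), working on the bidirected version of the toy graph:

# Bidirecting ([1, 2, 0, 4], [0, 1, 2, 3]) yields the undirected edges
# {0-1, 1-2, 0-2, 3-4}, i.e. 8 directed edges, with y = [0, 0, 0, 0, 1].
h_edge = 6 / 8  # 6 of the 8 directed edges connect same-class endpoints
# Node degrees are [2, 2, 2, 1, 1], so the per-class degree mass is
# D_0 = 7 and D_1 = 1, and p_bar(k) = D_k / (2|E|) = D_k / 8.
adjust = (7**2 + 1**2) / 8**2  # sum_k p_bar(k)^2
h_adj = (h_edge - adjust) / (1 - adjust)
print(h_adj)  # -0.142857..., i.e. -1/7, matching -0.1428571492433548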
python/dgl/label_informativeness.py (new file, mode 100644)

"""Utils for computing graph label informativeness"""
from . import to_bidirected

try:
    import torch
except ImportError:
    HAS_TORCH = False
else:
    HAS_TORCH = True

__all__ = ["edge_label_informativeness", "node_label_informativeness"]


def check_pytorch():
    """Check if PyTorch is the backend."""
    if HAS_TORCH is False:
        raise ModuleNotFoundError(
            "This function requires PyTorch to be the backend."
        )


def edge_label_informativeness(graph, y, eps=1e-8):
    r"""Label informativeness (:math:`\mathrm{LI}`) is a characteristic of
    labeled graphs proposed in the `Characterizing Graph Datasets for Node
    Classification: Homophily-Heterophily Dichotomy and Beyond
    <https://arxiv.org/abs/2209.06177>`__

    Label informativeness shows how much information about a node's label we
    get from knowing its neighbor's label. Formally, assume that we sample an
    edge :math:`(\xi,\eta) \in E`. The class labels of nodes :math:`\xi` and
    :math:`\eta` are then random variables :math:`y_\xi` and :math:`y_\eta`.
    We want to measure the amount of knowledge the label :math:`y_\eta` gives
    for predicting :math:`y_\xi`. The entropy :math:`H(y_\xi)` measures the
    'hardness' of predicting the label of :math:`\xi` without knowing
    :math:`y_\eta`. Given :math:`y_\eta`, this value is reduced to the
    conditional entropy :math:`H(y_\xi|y_\eta)`. In other words,
    :math:`y_\eta` reveals
    :math:`I(y_\xi,y_\eta) = H(y_\xi) - H(y_\xi|y_\eta)` information about
    the label. To make the obtained quantity comparable across different
    datasets, label informativeness is defined as the normalized mutual
    information of :math:`y_{\xi}` and :math:`y_{\eta}`:

    .. math::

        \mathrm{LI} = \frac{I(y_\xi,y_\eta)}{H(y_\xi)}

    Depending on the distribution used for sampling an edge
    :math:`(\xi, \eta)`, several variants of label informativeness can be
    obtained. Two of them are particularly intuitive: in edge label
    informativeness (:math:`\mathrm{LI}_{edge}`), edges are sampled uniformly
    at random, and in node label informativeness
    (:math:`\mathrm{LI}_{node}`), first a node is sampled uniformly at random
    and then an edge incident to it is sampled uniformly at random. These two
    versions of label informativeness differ in how they weight
    high/low-degree nodes. In edge label informativeness, averaging is over
    the edges, thus high-degree nodes are given more weight. In node label
    informativeness, averaging is over the nodes, so all nodes are weighted
    equally.

    This function computes edge label informativeness.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    y : torch.Tensor
        The node labels, which is a tensor of shape (|V|).
    eps : float, optional
        A small constant for numerical stability. (default: 1e-8)

    Returns
    -------
    float
        The edge label informativeness value.

    Examples
    --------
    >>> import dgl
    >>> import torch

    >>> graph = dgl.graph(([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]))
    >>> y = torch.tensor([0, 0, 0, 0, 1, 1])
    >>> dgl.edge_label_informativeness(graph, y)
    0.25177597999572754
    """
    check_pytorch()

    graph = to_bidirected(graph.cpu()).to(y.device)

    degrees = graph.in_degrees().float()
    num_classes = y.max() + 1

    class_degree_weighted_probs = torch.zeros(num_classes).to(y.device)
    class_degree_weighted_probs.index_add_(dim=0, index=y, source=degrees)
    class_degree_weighted_probs /= class_degree_weighted_probs.sum()

    edge_probs = torch.zeros(num_classes, num_classes).to(y.device)
    labels_u = y[graph.edges()[0].long()]
    labels_v = y[graph.edges()[1].long()]
    edge_probs.index_put_(
        indices=(labels_u, labels_v),
        values=torch.ones(graph.num_edges()).to(y.device),
        accumulate=True,
    )
    edge_probs /= edge_probs.sum()

    edge_probs += eps
    numerator = (edge_probs * torch.log(edge_probs)).sum()
    denominator = (
        class_degree_weighted_probs * torch.log(class_degree_weighted_probs)
    ).sum()
    li_edge = 2 - numerator / denominator

    return li_edge.item()


def node_label_informativeness(graph, y, eps=1e-8):
    r"""Label informativeness (:math:`\mathrm{LI}`) is a characteristic of
    labeled graphs proposed in the `Characterizing Graph Datasets for Node
    Classification: Homophily-Heterophily Dichotomy and Beyond
    <https://arxiv.org/abs/2209.06177>`__

    Label informativeness shows how much information about a node's label we
    get from knowing its neighbor's label. Formally, assume that we sample an
    edge :math:`(\xi,\eta) \in E`. The class labels of nodes :math:`\xi` and
    :math:`\eta` are then random variables :math:`y_\xi` and :math:`y_\eta`.
    We want to measure the amount of knowledge the label :math:`y_\eta` gives
    for predicting :math:`y_\xi`. The entropy :math:`H(y_\xi)` measures the
    'hardness' of predicting the label of :math:`\xi` without knowing
    :math:`y_\eta`. Given :math:`y_\eta`, this value is reduced to the
    conditional entropy :math:`H(y_\xi|y_\eta)`. In other words,
    :math:`y_\eta` reveals
    :math:`I(y_\xi,y_\eta) = H(y_\xi) - H(y_\xi|y_\eta)` information about
    the label. To make the obtained quantity comparable across different
    datasets, label informativeness is defined as the normalized mutual
    information of :math:`y_{\xi}` and :math:`y_{\eta}`:

    .. math::

        \mathrm{LI} = \frac{I(y_\xi,y_\eta)}{H(y_\xi)}

    Depending on the distribution used for sampling an edge
    :math:`(\xi, \eta)`, several variants of label informativeness can be
    obtained. Two of them are particularly intuitive: in edge label
    informativeness (:math:`\mathrm{LI}_{edge}`), edges are sampled uniformly
    at random, and in node label informativeness
    (:math:`\mathrm{LI}_{node}`), first a node is sampled uniformly at random
    and then an edge incident to it is sampled uniformly at random. These two
    versions of label informativeness differ in how they weight
    high/low-degree nodes. In edge label informativeness, averaging is over
    the edges, thus high-degree nodes are given more weight. In node label
    informativeness, averaging is over the nodes, so all nodes are weighted
    equally.

    This function computes node label informativeness.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    y : torch.Tensor
        The node labels, which is a tensor of shape (|V|).
    eps : float, optional
        A small constant for numerical stability. (default: 1e-8)

    Returns
    -------
    float
        The node label informativeness value.

    Examples
    --------
    >>> import dgl
    >>> import torch

    >>> graph = dgl.graph(([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]))
    >>> y = torch.tensor([0, 0, 0, 0, 1, 1])
    >>> dgl.node_label_informativeness(graph, y)
    0.3381872773170471
    """
    check_pytorch()

    graph = to_bidirected(graph.cpu()).to(y.device)

    degrees = graph.in_degrees().float()
    num_classes = y.max() + 1

    class_probs = torch.zeros(num_classes).to(y.device)
    class_probs.index_add_(
        dim=0, index=y, source=torch.ones(graph.num_nodes()).to(y.device)
    )
    class_probs /= class_probs.sum()

    class_degree_weighted_probs = torch.zeros(num_classes).to(y.device)
    class_degree_weighted_probs.index_add_(dim=0, index=y, source=degrees)
    class_degree_weighted_probs /= class_degree_weighted_probs.sum()

    num_nonzero_degree_nodes = (degrees > 0).sum()

    edge_probs = torch.zeros(num_classes, num_classes).to(y.device)
    labels_u = y[graph.edges()[0].long()]
    labels_v = y[graph.edges()[1].long()]
    degrees_u = degrees[graph.edges()[0].long()]
    edge_probs.index_put_(
        indices=(labels_u, labels_v),
        values=1 / (num_nonzero_degree_nodes * degrees_u),
        accumulate=True,
    )

    edge_probs += eps
    log = torch.log(
        edge_probs
        / (class_probs[:, None] * class_degree_weighted_probs[None, :])
    )
    numerator = (edge_probs * log).sum()
    denominator = (class_probs * torch.log(class_probs)).sum()
    li_node = -numerator / denominator

    return li_node.item()
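One step the code leaves implicit is why ``li_edge`` equals ``2 - numerator / denominator``. For a uniformly sampled edge, both endpoint labels are marginally distributed as the degree-weighted :math:`\bar{p}`, so the mutual information expands as

.. math::

    I(y_\xi, y_\eta)
    = \sum_{c_1, c_2} p(c_1, c_2)
      \log \frac{p(c_1, c_2)}{\bar{p}(c_1)\,\bar{p}(c_2)}
    = \sum_{c_1, c_2} p(c_1, c_2) \log p(c_1, c_2)
      - 2 \sum_{k} \bar{p}(k) \log \bar{p}(k).

Dividing by :math:`H(y_\xi) = -\sum_k \bar{p}(k) \log \bar{p}(k)` gives :math:`\mathrm{LI}_{edge} = 2 - \frac{\sum p \log p}{\sum \bar{p} \log \bar{p}}`, which is exactly ``2 - numerator / denominator``. In the node variant the two endpoint marginals differ (``class_probs`` for the sampled node, ``class_degree_weighted_probs`` for its neighbor), so the ratio stays inside the logarithm and :math:`\mathrm{LI}_{node} = I/H` reduces directly to ``-numerator / denominator``.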
tests/python/common/test_homophily.py

@@ -51,3 +51,18 @@ def test_linkx_homophily(idtype):
     y = F.tensor([0, 1, 2, 3, 4])
     assert math.isclose(dgl.linkx_homophily(graph, y), 0.0000000000000000)
+
+
+@unittest.skipIf(
+    dgl.backend.backend_name != "pytorch",
+    reason="Only support PyTorch for now",
+)
+@parametrize_idtype
+def test_adjusted_homophily(idtype):
+    # IfChangeThenChange: python/dgl/homophily.py
+    # Update the docstring example.
+    device = F.ctx()
+    graph = dgl.graph(
+        ([1, 2, 0, 4], [0, 1, 2, 3]), idtype=idtype, device=device
+    )
+    y = F.tensor([0, 0, 0, 0, 1])
+    assert math.isclose(dgl.adjusted_homophily(graph, y), -0.1428571492433548)
tests/python/common/test_label_informativeness.py (new file, mode 100644)

import math
import unittest

import backend as F

import dgl
from utils import parametrize_idtype


@unittest.skipIf(
    dgl.backend.backend_name != "pytorch",
    reason="Only support PyTorch for now",
)
@parametrize_idtype
def test_edge_label_informativeness(idtype):
    # IfChangeThenChange: python/dgl/label_informativeness.py
    # Update the docstring example.
    device = F.ctx()
    graph = dgl.graph(
        ([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]), idtype=idtype, device=device
    )
    y = F.tensor([0, 0, 0, 0, 1, 1])
    assert math.isclose(
        dgl.edge_label_informativeness(graph, y),
        0.25177597999572754,
        abs_tol=1e-6,
    )


@unittest.skipIf(
    dgl.backend.backend_name != "pytorch",
    reason="Only support PyTorch for now",
)
@parametrize_idtype
def test_node_label_informativeness(idtype):
    # IfChangeThenChange: python/dgl/label_informativeness.py
    # Update the docstring example.
    device = F.ctx()
    graph = dgl.graph(
        ([0, 1, 2, 2, 3, 4], [1, 2, 0, 3, 4, 5]), idtype=idtype, device=device
    )
    y = F.tensor([0, 0, 0, 0, 1, 1])
    assert math.isclose(
        dgl.node_label_informativeness(graph, y),
        0.3381872773170471,
        abs_tol=1e-6,
    )
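The new tests follow the suite's existing conventions (the `backend as F` shim and `parametrize_idtype`). Assuming the repository's standard pytest setup with the PyTorch backend selected, they can be run with, e.g., `DGLBACKEND=pytorch python -m pytest tests/python/common/test_homophily.py tests/python/common/test_label_informativeness.py`.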