"""QM9 dataset for graph property prediction (regression)."""
import os
import numpy as np
import scipy.sparse as sp
import torch
import dgl
from tqdm import trange
from dgl.data import QM9Dataset
from dgl.data.utils import load_graphs, save_graphs
from dgl.convert import graph as dgl_graph
class QM9(QM9Dataset):
r"""QM9 dataset for graph property prediction (regression)
This dataset consists of 130,831 molecules with 12 regression targets.
Nodes correspond to atoms and edges correspond to bonds.
Reference:
- `"Quantum-Machine.org" `_
- `"Directional Message Passing for Molecular Graphs" `_
Statistics:
- Number of graphs: 130,831
- Number of regression targets: 12
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| Keys | Property | Description | Unit |
+========+==================================+===================================================================================+=============================================+
| mu | :math:`\mu` | Dipole moment | :math:`\textrm{D}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| alpha | :math:`\alpha` | Isotropic polarizability | :math:`{a_0}^3` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| homo | :math:`\epsilon_{\textrm{HOMO}}` | Highest occupied molecular orbital energy | :math:`\textrm{eV}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| lumo | :math:`\epsilon_{\textrm{LUMO}}` | Lowest unoccupied molecular orbital energy | :math:`\textrm{eV}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| gap | :math:`\Delta \epsilon` | Gap between :math:`\epsilon_{\textrm{HOMO}}` and :math:`\epsilon_{\textrm{LUMO}}` | :math:`\textrm{eV}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| r2 | :math:`\langle R^2 \rangle` | Electronic spatial extent | :math:`{a_0}^2` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| zpve | :math:`\textrm{ZPVE}` | Zero point vibrational energy | :math:`\textrm{eV}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| U0 | :math:`U_0` | Internal energy at 0K | :math:`\textrm{eV}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| U | :math:`U` | Internal energy at 298.15K | :math:`\textrm{eV}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| H | :math:`H` | Enthalpy at 298.15K | :math:`\textrm{eV}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| G | :math:`G` | Free energy at 298.15K | :math:`\textrm{eV}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
| Cv | :math:`c_{\textrm{v}}` | Heat capavity at 298.15K | :math:`\frac{\textrm{cal}}{\textrm{mol K}}` |
+--------+----------------------------------+-----------------------------------------------------------------------------------+---------------------------------------------+
Parameters
----------
label_keys: list
Names of the regression property, which should be a subset of the keys in the table above.
edge_funcs: list
A list of edge-wise user-defined functions for chemical bonds. Default: None
cutoff: float
Cutoff distance for interatomic interactions, i.e. two atoms are connected in the corresponding graph if the distance between them is no larger than this.
Default: 5.0 Angstrom
raw_dir : str
Raw file directory to download/contains the input data directory.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: True
Attributes
----------
num_labels : int
Number of labels for each graph, i.e. number of prediction tasks
Raises
------
UserWarning
If the raw data is changed in the remote server by the author.
Examples
--------
>>> data = QM9Dataset(label_keys=['mu', 'gap'], cutoff=5.0)
>>> data.num_labels
2
>>>
>>> # iterate over the dataset
>>> for g, label in data:
... R = g.ndata['R'] # get coordinates of each atom
... Z = g.ndata['Z'] # get atomic numbers of each atom
... # your code here...
>>>
"""
def __init__(self,
label_keys,
edge_funcs=None,
cutoff=5.0,
raw_dir=None,
force_reload=False,
verbose=False):
self.edge_funcs = edge_funcs
self._keys = ['mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
super(QM9, self).__init__(label_keys=label_keys,
cutoff=cutoff,
raw_dir=raw_dir,
force_reload=force_reload,
verbose=verbose)
def has_cache(self):
""" step 1, if True, goto step 5; else goto download(step 2), then step 3"""
graph_path = f'{self.save_path}/dgl_graph.bin'
line_graph_path = f'{self.save_path}/dgl_line_graph.bin'
return os.path.exists(graph_path) and os.path.exists(line_graph_path)
def process(self):
""" step 3 """
npz_path = f'{self.raw_dir}/qm9_eV.npz'
data_dict = np.load(npz_path, allow_pickle=True)
# data_dict['N'] contains the number of atoms in each molecule,
# data_dict['R'] consists of the atomic coordinates,
# data_dict['Z'] consists of the atomic numbers.
# Atomic properties (Z and R) of all molecules are concatenated as single tensors,
# so you need this value to select the correct atoms for each molecule.
self.N = data_dict['N']
self.R = data_dict['R']
self.Z = data_dict['Z']
self.N_cumsum = np.concatenate([[0], np.cumsum(self.N)])
# graph labels
self.label_dict = {}
for k in self._keys:
self.label_dict[k] = torch.tensor(data_dict[k], dtype=torch.float32)
self.label = torch.stack([self.label_dict[key] for key in self.label_keys], dim=1)
# graphs & features
self.graphs, self.line_graphs = self._load_graph()
def _load_graph(self):
num_graphs = self.label.shape[0]
graphs = []
line_graphs = []
for idx in trange(num_graphs):
n_atoms = self.N[idx]
# get all the atomic coordinates of the idx-th molecular graph
R = self.R[self.N_cumsum[idx]:self.N_cumsum[idx + 1]]
# calculate the distance between all atoms
dist = np.linalg.norm(R[:, None, :] - R[None, :, :], axis=-1)
# keep all edges that don't exceed the cutoff and delete self-loops
adj = sp.csr_matrix(dist <= self.cutoff) - sp.eye(n_atoms, dtype=np.bool)
adj = adj.tocoo()
u, v = torch.tensor(adj.row), torch.tensor(adj.col)
g = dgl_graph((u, v))
g.ndata['R'] = torch.tensor(R, dtype=torch.float32)
g.ndata['Z'] = torch.tensor(self.Z[self.N_cumsum[idx]:self.N_cumsum[idx + 1]], dtype=torch.long)
# add user-defined features
if self.edge_funcs is not None:
for func in self.edge_funcs:
g.apply_edges(func)
graphs.append(g)
l_g = dgl.line_graph(g, backtracking=False)
line_graphs.append(l_g)
return graphs, line_graphs
def save(self):
""" step 4 """
graph_path = f'{self.save_path}/dgl_graph.bin'
line_graph_path = f'{self.save_path}/dgl_line_graph.bin'
save_graphs(str(graph_path), self.graphs, self.label_dict)
save_graphs(str(line_graph_path), self.line_graphs)
def load(self):
""" step 5 """
graph_path = f'{self.save_path}/dgl_graph.bin'
line_graph_path = f'{self.save_path}/dgl_line_graph.bin'
self.graphs, label_dict = load_graphs(graph_path)
self.line_graphs, _ = load_graphs(line_graph_path)
self.label = torch.stack([label_dict[key] for key in self.label_keys], dim=1)
def __getitem__(self, idx):
r""" Get graph and label by index
Parameters
----------
idx : int
Item index
Returns
-------
dgl.DGLGraph
The graph contains:
- ``ndata['R']``: the coordinates of each atom
- ``ndata['Z']``: the atomic number
Tensor
Property values of molecular graphs
"""
return self.graphs[idx], self.line_graphs[idx], self.label[idx]