Unverified Commit fb02aa2d authored by Tong He's avatar Tong He Committed by GitHub
Browse files

[Dataset] Add hash function to distinguish instances from the same dataset class (#1894)

* add hash function in base data classes

* update doc
parent 71c7ee0e
......@@ -39,16 +39,24 @@ class DGLDataset(object):
save_dir : str
Directory to save the processed dataset.
Default: same as raw_dir
hash_key : tuple
A tuple of values as the input for the hash function.
Users can distinguish instances (and their caches on the disk)
from the same dataset class by comparing the hash values.
Default: (). The resulting hash value depends on the hash implementation in use, so do not rely on a hard-coded value.
force_reload : bool
Whether to reload the dataset. Default: False
verbose : bool
Whether to print out progress information
"""
def __init__(self, name, url=None, raw_dir=None, save_dir=None, force_reload=False, verbose=False):
def __init__(self, name, url=None, raw_dir=None, save_dir=None,
hash_key=(), force_reload=False, verbose=False):
self._name = name
self._url = url
self._force_reload = force_reload
self._verbose = verbose
self._hash_key = hash_key
self._hash = self._get_hash()
# if no dir is provided, the default dgl download dir is used.
if raw_dir is None:
......@@ -148,6 +156,17 @@ class DGLDataset(object):
if self.verbose:
print('Done saving data into cached files.')
def _get_hash(self):
"""Compute the hash of the input tuple
Example
-------
>>> hash_value = self._get_hash((10, False, True))
>>> hash_value
6299899980521991026
"""
return abs(hash(self._hash_key))
@property
def url(self):
r"""Get url to download the raw dataset.
......@@ -191,6 +210,12 @@ class DGLDataset(object):
"""
return self._verbose
@property
def hash(self):
    r"""Return the hash value stored on this dataset instance.

    The value is read from ``self._hash``; it is not recomputed on access.
    """
    stored_hash = self._hash
    return stored_hash
@abc.abstractmethod
def __getitem__(self, idx):
r"""Gets the data object at index.
......@@ -215,16 +240,21 @@ class DGLBuiltinDataset(DGLDataset):
downloaded data or the directory that
already stores the input data.
Default: ~/.dgl/
hash_key : tuple
A tuple of values as the input for the hash function.
Users can distinguish instances (and their caches on the disk)
from the same dataset class by comparing the hash values.
force_reload : bool
Whether to reload the dataset. Default: False
verbose: bool
Whether to print out progress information. Default: False
"""
def __init__(self, name, url, raw_dir=None, force_reload=False, verbose=False):
def __init__(self, name, url, raw_dir=None, hash_key=(), force_reload=False, verbose=False):
# Delegate all setup to DGLDataset.__init__. save_dir is always passed as None
# (the base class documents that default as "same as raw_dir"); every other
# argument, including hash_key, is forwarded unchanged.
super(DGLBuiltinDataset, self).__init__(name,
url=url,
raw_dir=raw_dir,
save_dir=None,
hash_key=hash_key,
force_reload=force_reload,
verbose=verbose)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment