Unverified Commit b5be4f4b authored by Tong He's avatar Tong He Committed by GitHub
Browse files

[Dataset] Update dataset hash with deterministic function (#1919)

* update hash with deterministic function

* update docstring

* add test for determinism
parent 39ed0966
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
from __future__ import absolute_import from __future__ import absolute_import
import os, sys import os, sys, hashlib
import abc import abc
from .utils import download, extract_archive, get_download_dir, makedirs from .utils import download, extract_archive, get_download_dir, makedirs
from ..utils import retry_method_with_fix from ..utils import retry_method_with_fix
...@@ -43,7 +43,7 @@ class DGLDataset(object): ...@@ -43,7 +43,7 @@ class DGLDataset(object):
A tuple of values as the input for the hash function. A tuple of values as the input for the hash function.
Users can distinguish instances (and their caches on the disk) Users can distinguish instances (and their caches on the disk)
from the same dataset class by comparing the hash values. from the same dataset class by comparing the hash values.
Default: (), the corresponding hash value is 3527539 Default: (), the corresponding hash value is 'f9065fa7'.
force_reload : bool force_reload : bool
Whether to reload the dataset. Default: False Whether to reload the dataset. Default: False
verbose : bool verbose : bool
...@@ -56,6 +56,7 @@ class DGLDataset(object): ...@@ -56,6 +56,7 @@ class DGLDataset(object):
self._force_reload = force_reload self._force_reload = force_reload
self._verbose = verbose self._verbose = verbose
self._hash_key = hash_key self._hash_key = hash_key
self._hash_func = hashlib.sha1()
self._hash = self._get_hash() self._hash = self._get_hash()
# if no dir is provided, the default dgl download dir is used. # if no dir is provided, the default dgl download dir is used.
...@@ -161,11 +162,14 @@ class DGLDataset(object): ...@@ -161,11 +162,14 @@ class DGLDataset(object):
Example Example
------- -------
>>> hash_value = self._get_hash((10, False, True)) Assume `self._hash_key = (10, False, True)`
>>> hash_value = self._get_hash()
>>> hash_value >>> hash_value
6299899980521991026 'a770b222'
""" """
return abs(hash(self._hash_key)) self._hash_func.update(str(self._hash_key).encode('utf-8'))
return self._hash_func.hexdigest()[:8]
@property @property
def url(self): def url(self):
......
...@@ -5,5 +5,19 @@ def test_minigc(): ...@@ -5,5 +5,19 @@ def test_minigc():
g, l = list(zip(*ds)) g, l = list(zip(*ds))
print(g, l) print(g, l)
def test_data_hash():
    """Check that dataset hashes agree for equal hash keys and differ otherwise."""

    class HashTestDataset(data.DGLDataset):
        # Minimal dataset used only for hashing: it forwards ``hash_key``
        # to the base class and its ``_load`` does nothing.
        def __init__(self, hash_key=()):
            super(HashTestDataset, self).__init__('hashtest', hash_key=hash_key)

        def _load(self):
            pass

    # Two datasets built from equal keys, and one whose nested tuple differs
    # in its last element.
    first = HashTestDataset((True, 0, '1', (1, 2, 3)))
    twin = HashTestDataset((True, 0, '1', (1, 2, 3)))
    other = HashTestDataset((True, 0, '1', (1, 2, 4)))

    assert first.hash == twin.hash
    assert first.hash != other.hash
if __name__ == '__main__': if __name__ == '__main__':
test_minigc() test_minigc()
test_data_hash()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment