OpenDAS / dgl — commit 90a308f3 (unverified)
Authored by Rhett Ying on Jul 07, 2023; committed by GitHub on Jul 07, 2023.

[GraphBolt] split OnDiskDataset into separate file (#5963)

Parent: dc90ea16

Showing 6 changed files with 586 additions and 566 deletions.
python/dgl/graphbolt/__init__.py                       +1   −0
python/dgl/graphbolt/dataset.py                        +2   −126
python/dgl/graphbolt/impl/__init__.py                  +2   −0
python/dgl/graphbolt/impl/ondisk_dataset.py            +133 −0
tests/python/pytorch/graphbolt/test_dataset.py         +0   −440
tests/python/pytorch/graphbolt/test_ondisk_dataset.py  +448 −0
python/dgl/graphbolt/__init__.py

```diff
@@ -12,6 +12,7 @@ from .feature_store import *
 from .feature_fetcher import *
 from .copy_to import *
 from .dataset import *
+from .impl import *
 from .dataloader import *
 from .subgraph_sampler import *
```
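Because ``dataset.py`` drops ``OnDiskDataset`` from its ``__all__`` (see the next file) while the new ``from .impl import *`` re-exports it, the public import path is meant to stay unchanged. Below is a minimal sketch of the user-facing import after this commit; the ``metadata.yaml`` path is a hypothetical stand-in for a file following the schema in ``OnDiskDataset``'s docstring:

```python
# OnDiskDataset now lives in dgl.graphbolt.impl but is re-exported, so user
# code keeps importing it from dgl.graphbolt exactly as before this commit.
from dgl import graphbolt as gb

dataset = gb.OnDiskDataset("metadata.yaml")  # hypothetical YAML path
train_sets = dataset.train_sets()            # list of ItemSet / ItemSetDict
```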
python/dgl/graphbolt/dataset.py

```diff
 """GraphBolt Dataset."""
-from typing import List, Optional
-
-import pydantic
-import pydantic_yaml
+from typing import List

 from .feature_store import FeatureStore
 from .itemset import ItemSet, ItemSetDict
-from .utils import read_data, tensor_to_tuple

-__all__ = ["Dataset", "OnDiskDataset"]
+__all__ = ["Dataset"]


 class Dataset:
     ...

@@ -54,123 +50,3 @@ class Dataset:
     def feature(self) -> FeatureStore:
         """Return the feature."""
         raise NotImplementedError
```

The remainder of the hunk deletes the ``OnDiskDataFormatEnum``, ``OnDiskTVTSet``, ``OnDiskMetaData``, and ``OnDiskDataset`` definitions. They move verbatim (with module-level imports adjusted from ``.`` to ``..``) to the new python/dgl/graphbolt/impl/ondisk_dataset.py, reproduced in full below.
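With the split applied, ``dataset.py`` keeps only the abstract ``Dataset`` interface. The sketch below shows how a concrete dataset would subclass it; ``InMemoryToyDataset`` and its contents are illustrative assumptions, not part of this commit:

```python
from typing import List

from dgl.graphbolt.dataset import Dataset
from dgl.graphbolt.itemset import ItemSet


class InMemoryToyDataset(Dataset):
    """Hypothetical subclass holding one pre-built training split in memory."""

    def __init__(self, train_items: ItemSet) -> None:
        self._train_sets = [train_items]

    def train_sets(self) -> List[ItemSet]:
        """Return the single pre-built training split."""
        return self._train_sets

    # graph() and feature() intentionally keep the base-class behavior and
    # raise NotImplementedError until overridden.
```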
python/dgl/graphbolt/impl/__init__.py (new file, mode 100644)

```python
"""Implementation of GraphBolt."""
from .ondisk_dataset import *
```
python/dgl/graphbolt/impl/ondisk_dataset.py (new file, mode 100644)

```python
"""GraphBolt OnDiskDataset."""
from typing import List, Optional

import pydantic
import pydantic_yaml

from ..dataset import Dataset
from ..feature_store import FeatureStore
from ..itemset import ItemSet, ItemSetDict
from ..utils import read_data, tensor_to_tuple

__all__ = ["OnDiskDataset"]


class OnDiskDataFormatEnum(pydantic_yaml.YamlStrEnum):
    """Enum of data format."""

    TORCH = "torch"
    NUMPY = "numpy"


class OnDiskTVTSet(pydantic.BaseModel):
    """Train-Validation-Test set."""

    type_name: Optional[str]
    format: OnDiskDataFormatEnum
    in_memory: Optional[bool] = True
    path: str


class OnDiskMetaData(pydantic_yaml.YamlModel):
    """Metadata specification in YAML.

    As multiple node/edge types and multiple splits are supported, each TVT
    set is a list of lists of ``OnDiskTVTSet``.
    """

    train_sets: Optional[List[List[OnDiskTVTSet]]]
    validation_sets: Optional[List[List[OnDiskTVTSet]]]
    test_sets: Optional[List[List[OnDiskTVTSet]]]


class OnDiskDataset(Dataset):
    """An on-disk dataset.

    An on-disk dataset reads graph topology, feature data and TVT sets from
    disk. Due to limited resources, data too large to fit into RAM remains on
    disk while the rest resides in RAM once ``OnDiskDataset`` is initialized.
    This behavior can be controlled by the user via the ``in_memory`` field
    in the YAML file.

    A full example of the YAML file is as follows:

    .. code-block:: yaml

        train_sets:
          - - type_name: paper  # could be null for homogeneous graph.
              format: numpy
              in_memory: true  # If not specified, default to true.
              path: set/paper-train.npy
        validation_sets:
          - - type_name: paper
              format: numpy
              in_memory: true
              path: set/paper-validation.npy
        test_sets:
          - - type_name: paper
              format: numpy
              in_memory: true
              path: set/paper-test.npy

    Parameters
    ----------
    path: str
        The YAML file path.
    """

    def __init__(self, path: str) -> None:
        with open(path, "r") as f:
            self._meta = OnDiskMetaData.parse_raw(f.read(), proto="yaml")
        self._train_sets = self._init_tvt_sets(self._meta.train_sets)
        self._validation_sets = self._init_tvt_sets(
            self._meta.validation_sets
        )
        self._test_sets = self._init_tvt_sets(self._meta.test_sets)

    def train_sets(self) -> List[ItemSet] or List[ItemSetDict]:
        """Return the training set."""
        return self._train_sets

    def validation_sets(self) -> List[ItemSet] or List[ItemSetDict]:
        """Return the validation set."""
        return self._validation_sets

    def test_sets(self) -> List[ItemSet] or List[ItemSetDict]:
        """Return the test set."""
        return self._test_sets

    def graph(self) -> object:
        """Return the graph."""
        raise NotImplementedError

    def feature(self) -> FeatureStore:
        """Return the feature."""
        raise NotImplementedError

    def _init_tvt_sets(
        self, tvt_sets: List[List[OnDiskTVTSet]]
    ) -> List[ItemSet] or List[ItemSetDict]:
        """Initialize the TVT sets."""
        if (tvt_sets is None) or (len(tvt_sets) == 0):
            return None
        ret = []
        for tvt_set in tvt_sets:
            if (tvt_set is None) or (len(tvt_set) == 0):
                # Skip empty entries; there is nothing to read for them.
                ret.append(None)
                continue
            if tvt_set[0].type_name is None:
                # Homogeneous split: a single un-typed set per entry.
                assert (
                    len(tvt_set) == 1
                ), "Only one TVT set is allowed if type_name is not specified."
                data = read_data(
                    tvt_set[0].path, tvt_set[0].format, tvt_set[0].in_memory
                )
                ret.append(ItemSet(tensor_to_tuple(data)))
            else:
                # Heterogeneous split: one ItemSet per node/edge type.
                data = {}
                for tvt in tvt_set:
                    data[tvt.type_name] = ItemSet(
                        tensor_to_tuple(
                            read_data(tvt.path, tvt.format, tvt.in_memory)
                        )
                    )
                ret.append(ItemSetDict(data))
        return ret
```
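The pydantic models make the YAML schema self-validating: ``OnDiskDataset.__init__`` simply feeds the file contents to ``OnDiskMetaData.parse_raw``. Here is a minimal sketch of that validation step in isolation, assuming the pydantic-yaml version pinned by DGL at the time, which still provides ``YamlModel.parse_raw(..., proto="yaml")``:

```python
from dgl.graphbolt.impl.ondisk_dataset import OnDiskMetaData

yaml_text = """
train_sets:
  - - type_name: null
      format: numpy
      path: set/train.npy
"""
meta = OnDiskMetaData.parse_raw(yaml_text, proto="yaml")
print(meta.train_sets[0][0].format)     # OnDiskDataFormatEnum.NUMPY
print(meta.train_sets[0][0].in_memory)  # True, the declared default

# An invalid ``format`` such as "torch_invalid" raises
# pydantic.ValidationError, which is what the new test file asserts below.
```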
tests/python/pytorch/graphbolt/test_dataset.py

```diff
@@ -20,443 +20,3 @@ def test_Dataset():
         _ = dataset.graph()
     with pytest.raises(NotImplementedError):
         _ = dataset.feature()
```

The remainder of the hunk deletes the five ``test_OnDiskDataset_*`` tests (``test_OnDiskDataset_TVTSet_exceptions``, ``test_OnDiskDataset_TVTSet_ItemSet_id_label``, ``test_OnDiskDataset_TVTSet_ItemSet_node_pair_label``, ``test_OnDiskDataset_TVTSet_ItemSetDict_id_label``, and ``test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label``). They move verbatim to the new tests/python/pytorch/graphbolt/test_ondisk_dataset.py, reproduced in full below.
tests/python/pytorch/graphbolt/test_ondisk_dataset.py (new file, mode 100644)

```python
import os
import tempfile

import numpy as np
import pydantic
import pytest

from dgl import graphbolt as gb


def test_OnDiskDataset_TVTSet_exceptions():
    """Test exceptions thrown when parsing TVTSet."""
    with tempfile.TemporaryDirectory() as test_dir:
        yaml_file = os.path.join(test_dir, "test.yaml")

        # Case 1: ``format`` is invalid.
        yaml_content = """
        train_sets:
          - - type_name: paper
              format: torch_invalid
              path: set/paper-train.pt
        """
        with open(yaml_file, "w") as f:
            f.write(yaml_content)
        with pytest.raises(pydantic.ValidationError):
            _ = gb.OnDiskDataset(yaml_file)

        # Case 2: ``type_name`` is not specified while multiple TVT sets
        # are specified.
        yaml_content = """
        train_sets:
          - - type_name: null
              format: numpy
              path: set/train.npy
            - type_name: null
              format: numpy
              path: set/train.npy
        """
        with open(yaml_file, "w") as f:
            f.write(yaml_content)
        with pytest.raises(
            AssertionError,
            match=r"Only one TVT set is allowed if type_name is not specified.",
        ):
            _ = gb.OnDiskDataset(yaml_file)


def test_OnDiskDataset_TVTSet_ItemSet_id_label():
    """Test TVTSet which returns ItemSet with IDs and labels."""
    with tempfile.TemporaryDirectory() as test_dir:
        train_ids = np.arange(1000)
        train_labels = np.random.randint(0, 10, size=1000)
        train_data = np.vstack([train_ids, train_labels]).T
        train_path = os.path.join(test_dir, "train.npy")
        np.save(train_path, train_data)

        validation_ids = np.arange(1000, 2000)
        validation_labels = np.random.randint(0, 10, size=1000)
        validation_data = np.vstack([validation_ids, validation_labels]).T
        validation_path = os.path.join(test_dir, "validation.npy")
        np.save(validation_path, validation_data)

        test_ids = np.arange(2000, 3000)
        test_labels = np.random.randint(0, 10, size=1000)
        test_data = np.vstack([test_ids, test_labels]).T
        test_path = os.path.join(test_dir, "test.npy")
        np.save(test_path, test_data)

        # Case 1: all TVT sets are specified; ``type_name`` is either omitted
        # or specified as ``null``; ``in_memory`` may be ``true`` or ``false``.
        yaml_content = f"""
        train_sets:
          - - type_name: null
              format: numpy
              in_memory: true
              path: {train_path}
          - - type_name: null
              format: numpy
              path: {train_path}
        validation_sets:
          - - format: numpy
              path: {validation_path}
          - - type_name: null
              format: numpy
              path: {validation_path}
        test_sets:
          - - type_name: null
              format: numpy
              in_memory: false
              path: {test_path}
          - - type_name: null
              format: numpy
              path: {test_path}
        """
        yaml_file = os.path.join(test_dir, "test.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)
        dataset = gb.OnDiskDataset(yaml_file)

        # Verify train set.
        train_sets = dataset.train_sets()
        assert len(train_sets) == 2
        for train_set in train_sets:
            assert len(train_set) == 1000
            assert isinstance(train_set, gb.ItemSet)
            for i, (id, label) in enumerate(train_set):
                assert id == train_ids[i]
                assert label == train_labels[i]
        train_sets = None

        # Verify validation set.
        validation_sets = dataset.validation_sets()
        assert len(validation_sets) == 2
        for validation_set in validation_sets:
            assert len(validation_set) == 1000
            assert isinstance(validation_set, gb.ItemSet)
            for i, (id, label) in enumerate(validation_set):
                assert id == validation_ids[i]
                assert label == validation_labels[i]
        validation_sets = None

        # Verify test set.
        test_sets = dataset.test_sets()
        assert len(test_sets) == 2
        for test_set in test_sets:
            assert len(test_set) == 1000
            assert isinstance(test_set, gb.ItemSet)
            for i, (id, label) in enumerate(test_set):
                assert id == test_ids[i]
                assert label == test_labels[i]
        test_sets = None
        dataset = None

        # Case 2: some TVT sets are None.
        yaml_content = f"""
        train_sets:
          - - type_name: null
              format: numpy
              path: {train_path}
        """
        with open(yaml_file, "w") as f:
            f.write(yaml_content)
        dataset = gb.OnDiskDataset(yaml_file)
        assert dataset.train_sets() is not None
        assert dataset.validation_sets() is None
        assert dataset.test_sets() is None
        dataset = None


def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
    """Test TVTSet which returns ItemSet with node pairs and labels."""
    with tempfile.TemporaryDirectory() as test_dir:
        train_pairs = (np.arange(1000), np.arange(1000, 2000))
        train_labels = np.random.randint(0, 10, size=1000)
        train_data = np.vstack([train_pairs, train_labels]).T
        train_path = os.path.join(test_dir, "train.npy")
        np.save(train_path, train_data)

        validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
        validation_labels = np.random.randint(0, 10, size=1000)
        validation_data = np.vstack([validation_pairs, validation_labels]).T
        validation_path = os.path.join(test_dir, "validation.npy")
        np.save(validation_path, validation_data)

        test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
        test_labels = np.random.randint(0, 10, size=1000)
        test_data = np.vstack([test_pairs, test_labels]).T
        test_path = os.path.join(test_dir, "test.npy")
        np.save(test_path, test_data)

        yaml_content = f"""
        train_sets:
          - - type_name: null
              format: numpy
              in_memory: true
              path: {train_path}
          - - type_name: null
              format: numpy
              path: {train_path}
        validation_sets:
          - - format: numpy
              path: {validation_path}
          - - type_name: null
              format: numpy
              path: {validation_path}
        test_sets:
          - - type_name: null
              format: numpy
              in_memory: false
              path: {test_path}
          - - type_name: null
              format: numpy
              path: {test_path}
        """
        yaml_file = os.path.join(test_dir, "test.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)
        dataset = gb.OnDiskDataset(yaml_file)

        # Verify train set.
        train_sets = dataset.train_sets()
        assert len(train_sets) == 2
        for train_set in train_sets:
            assert len(train_set) == 1000
            assert isinstance(train_set, gb.ItemSet)
            for i, (src, dst, label) in enumerate(train_set):
                assert src == train_pairs[0][i]
                assert dst == train_pairs[1][i]
                assert label == train_labels[i]
        train_sets = None

        # Verify validation set.
        validation_sets = dataset.validation_sets()
        assert len(validation_sets) == 2
        for validation_set in validation_sets:
            assert len(validation_set) == 1000
            assert isinstance(validation_set, gb.ItemSet)
            for i, (src, dst, label) in enumerate(validation_set):
                assert src == validation_pairs[0][i]
                assert dst == validation_pairs[1][i]
                assert label == validation_labels[i]
        validation_sets = None

        # Verify test set.
        test_sets = dataset.test_sets()
        assert len(test_sets) == 2
        for test_set in test_sets:
            assert len(test_set) == 1000
            assert isinstance(test_set, gb.ItemSet)
            for i, (src, dst, label) in enumerate(test_set):
                assert src == test_pairs[0][i]
                assert dst == test_pairs[1][i]
                assert label == test_labels[i]
        test_sets = None
        dataset = None


def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
    """Test TVTSet which returns ItemSetDict with IDs and labels."""
    with tempfile.TemporaryDirectory() as test_dir:
        train_ids = np.arange(1000)
        train_labels = np.random.randint(0, 10, size=1000)
        train_data = np.vstack([train_ids, train_labels]).T
        train_path = os.path.join(test_dir, "train.npy")
        np.save(train_path, train_data)

        validation_ids = np.arange(1000, 2000)
        validation_labels = np.random.randint(0, 10, size=1000)
        validation_data = np.vstack([validation_ids, validation_labels]).T
        validation_path = os.path.join(test_dir, "validation.npy")
        np.save(validation_path, validation_data)

        test_ids = np.arange(2000, 3000)
        test_labels = np.random.randint(0, 10, size=1000)
        test_data = np.vstack([test_ids, test_labels]).T
        test_path = os.path.join(test_dir, "test.npy")
        np.save(test_path, test_data)

        yaml_content = f"""
        train_sets:
          - - type_name: paper
              format: numpy
              in_memory: true
              path: {train_path}
          - - type_name: author
              format: numpy
              path: {train_path}
        validation_sets:
          - - type_name: paper
              format: numpy
              path: {validation_path}
          - - type_name: author
              format: numpy
              path: {validation_path}
        test_sets:
          - - type_name: paper
              format: numpy
              in_memory: false
              path: {test_path}
          - - type_name: author
              format: numpy
              path: {test_path}
        """
        yaml_file = os.path.join(test_dir, "test.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)
        dataset = gb.OnDiskDataset(yaml_file)

        # Verify train set.
        train_sets = dataset.train_sets()
        assert len(train_sets) == 2
        for train_set in train_sets:
            assert len(train_set) == 1000
            assert isinstance(train_set, gb.ItemSetDict)
            for i, item in enumerate(train_set):
                assert isinstance(item, dict)
                assert len(item) == 1
                key = list(item.keys())[0]
                assert key in ["paper", "author"]
                id, label = item[key]
                assert id == train_ids[i]
                assert label == train_labels[i]
        train_sets = None

        # Verify validation set.
        validation_sets = dataset.validation_sets()
        assert len(validation_sets) == 2
        for validation_set in validation_sets:
            assert len(validation_set) == 1000
            assert isinstance(validation_set, gb.ItemSetDict)
            for i, item in enumerate(validation_set):
                assert isinstance(item, dict)
                assert len(item) == 1
                key = list(item.keys())[0]
                assert key in ["paper", "author"]
                id, label = item[key]
                assert id == validation_ids[i]
                assert label == validation_labels[i]
        validation_sets = None

        # Verify test set.
        test_sets = dataset.test_sets()
        assert len(test_sets) == 2
        for test_set in test_sets:
            assert len(test_set) == 1000
            assert isinstance(test_set, gb.ItemSetDict)
            for i, item in enumerate(test_set):
                assert isinstance(item, dict)
                assert len(item) == 1
                key = list(item.keys())[0]
                assert key in ["paper", "author"]
                id, label = item[key]
                assert id == test_ids[i]
                assert label == test_labels[i]
        test_sets = None
        dataset = None


def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
    """Test TVTSet which returns ItemSetDict with node pairs and labels."""
    with tempfile.TemporaryDirectory() as test_dir:
        train_pairs = (np.arange(1000), np.arange(1000, 2000))
        train_labels = np.random.randint(0, 10, size=1000)
        train_data = np.vstack([train_pairs, train_labels]).T
        train_path = os.path.join(test_dir, "train.npy")
        np.save(train_path, train_data)

        validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
        validation_labels = np.random.randint(0, 10, size=1000)
        validation_data = np.vstack([validation_pairs, validation_labels]).T
        validation_path = os.path.join(test_dir, "validation.npy")
        np.save(validation_path, validation_data)

        test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
        test_labels = np.random.randint(0, 10, size=1000)
        test_data = np.vstack([test_pairs, test_labels]).T
        test_path = os.path.join(test_dir, "test.npy")
        np.save(test_path, test_data)

        yaml_content = f"""
        train_sets:
          - - type_name: paper
              format: numpy
              in_memory: true
              path: {train_path}
          - - type_name: author
              format: numpy
              path: {train_path}
        validation_sets:
          - - type_name: paper
              format: numpy
              path: {validation_path}
          - - type_name: author
              format: numpy
              path: {validation_path}
        test_sets:
          - - type_name: paper
              format: numpy
              in_memory: false
              path: {test_path}
          - - type_name: author
              format: numpy
              path: {test_path}
        """
        yaml_file = os.path.join(test_dir, "test.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)
        dataset = gb.OnDiskDataset(yaml_file)

        # Verify train set.
        train_sets = dataset.train_sets()
        assert len(train_sets) == 2
        for train_set in train_sets:
            assert len(train_set) == 1000
            assert isinstance(train_set, gb.ItemSetDict)
            for i, item in enumerate(train_set):
                assert isinstance(item, dict)
                assert len(item) == 1
                key = list(item.keys())[0]
                assert key in ["paper", "author"]
                src, dst, label = item[key]
                assert src == train_pairs[0][i]
                assert dst == train_pairs[1][i]
                assert label == train_labels[i]
        train_sets = None

        # Verify validation set.
        validation_sets = dataset.validation_sets()
        assert len(validation_sets) == 2
        for validation_set in validation_sets:
            assert len(validation_set) == 1000
            assert isinstance(validation_set, gb.ItemSetDict)
            for i, item in enumerate(validation_set):
                assert isinstance(item, dict)
                assert len(item) == 1
                key = list(item.keys())[0]
                assert key in ["paper", "author"]
                src, dst, label = item[key]
                assert src == validation_pairs[0][i]
                assert dst == validation_pairs[1][i]
                assert label == validation_labels[i]
        validation_sets = None

        # Verify test set.
        test_sets = dataset.test_sets()
        assert len(test_sets) == 2
        for test_set in test_sets:
            assert len(test_set) == 1000
            assert isinstance(test_set, gb.ItemSetDict)
            for i, item in enumerate(test_set):
                assert isinstance(item, dict)
                assert len(item) == 1
                key = list(item.keys())[0]
                assert key in ["paper", "author"]
                src, dst, label = item[key]
                assert src == test_pairs[0][i]
                assert dst == test_pairs[1][i]
                assert label == test_labels[i]
        test_sets = None
        dataset = None
```
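All of these tests rely on the same columnar layout: ``np.vstack([...]).T`` turns IDs and labels into a ``(1000, 2)`` array, and node pairs plus labels into a ``(1000, 3)`` array, which ``read_data`` loads back and ``tensor_to_tuple`` evidently splits into per-column tensors, so each item iterates as an ``(id, label)`` or ``(src, dst, label)`` tuple. A quick shape check of that layout in plain NumPy, independent of GraphBolt:

```python
import numpy as np

ids = np.arange(1000)
labels = np.random.randint(0, 10, size=1000)
# IDs and labels stacked as rows, then transposed: one (id, label) row per item.
assert np.vstack([ids, labels]).T.shape == (1000, 2)

pairs = (np.arange(1000), np.arange(1000, 2000))
# vstack flattens the (src, dst) tuple into two rows before transposing.
assert np.vstack([pairs, labels]).T.shape == (1000, 3)
```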