Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
47d37e91
Unverified
Commit
47d37e91
authored
Aug 02, 2023
by
Rhett Ying
Committed by
GitHub
Aug 02, 2023
Browse files
[GraphBolt] convert TVT from list of list to list (#6080)
parent
12ade95c
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
435 additions
and
481 deletions
+435
-481
python/dgl/graphbolt/dataset.py
python/dgl/graphbolt/dataset.py
+7
-7
python/dgl/graphbolt/impl/ondisk_dataset.py
python/dgl/graphbolt/impl/ondisk_dataset.py
+85
-93
python/dgl/graphbolt/impl/ondisk_metadata.py
python/dgl/graphbolt/impl/ondisk_metadata.py
+3
-3
tests/python/pytorch/graphbolt/test_dataset.py
tests/python/pytorch/graphbolt/test_dataset.py
+5
-11
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
+335
-367
No files found.
python/dgl/graphbolt/dataset.py
View file @
47d37e91
"""GraphBolt Dataset."""
"""GraphBolt Dataset."""
from
typing
import
Dict
,
List
from
typing
import
Dict
from
.feature_store
import
FeatureStore
from
.feature_store
import
FeatureStore
from
.itemset
import
ItemSet
,
ItemSetDict
from
.itemset
import
ItemSet
,
ItemSetDict
...
@@ -32,18 +32,18 @@ class Dataset:
...
@@ -32,18 +32,18 @@ class Dataset:
"""
"""
@
property
@
property
def
train_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
train_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the training set
s
."""
"""Return the training set."""
raise
NotImplementedError
raise
NotImplementedError
@
property
@
property
def
validation_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
validation_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the validation set
s
."""
"""Return the validation set."""
raise
NotImplementedError
raise
NotImplementedError
@
property
@
property
def
test_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
test_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the test set
s
."""
"""Return the test set."""
raise
NotImplementedError
raise
NotImplementedError
@
property
@
property
...
...
python/dgl/graphbolt/impl/ondisk_dataset.py
View file @
47d37e91
...
@@ -165,14 +165,11 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
...
@@ -165,14 +165,11 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
)
)
# 7. Save the train/val/test split according to the output_config.
# 7. Save the train/val/test split according to the output_config.
for
set_name
in
[
"train_set
s
"
,
"validation_set
s
"
,
"test_set
s
"
]:
for
set_name
in
[
"train_set"
,
"validation_set"
,
"test_set"
]:
if
set_name
not
in
input_config
:
if
set_name
not
in
input_config
:
continue
continue
for
intput_set_split
,
output_set_split
in
zip
(
input_config
[
set_name
],
output_config
[
set_name
]
):
for
input_set_per_type
,
output_set_per_type
in
zip
(
for
input_set_per_type
,
output_set_per_type
in
zip
(
in
t
put_
set_split
,
output_set_split
input_
config
[
set_name
],
output_config
[
set_name
]
):
):
for
input_data
,
output_data
in
zip
(
for
input_data
,
output_data
in
zip
(
input_set_per_type
[
"data"
],
output_set_per_type
[
"data"
]
input_set_per_type
[
"data"
],
output_set_per_type
[
"data"
]
...
@@ -245,8 +242,8 @@ class OnDiskDataset(Dataset):
...
@@ -245,8 +242,8 @@ class OnDiskDataset(Dataset):
format: numpy
format: numpy
in_memory: false
in_memory: false
path: edge_data/author-writes-paper-feat.npy
path: edge_data/author-writes-paper-feat.npy
train_set
s
:
train_set:
-
-
type: paper # could be null for homogeneous graph.
- type: paper # could be null for homogeneous graph.
data: # multiple data sources could be specified.
data: # multiple data sources could be specified.
- format: numpy
- format: numpy
in_memory: true # If not specified, default to true.
in_memory: true # If not specified, default to true.
...
@@ -254,14 +251,14 @@ class OnDiskDataset(Dataset):
...
@@ -254,14 +251,14 @@ class OnDiskDataset(Dataset):
- format: numpy
- format: numpy
in_memory: false
in_memory: false
path: set/paper-train-dst.npy
path: set/paper-train-dst.npy
validation_set
s
:
validation_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path: set/paper-validation.npy
path: set/paper-validation.npy
test_set
s
:
test_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
...
@@ -285,24 +282,24 @@ class OnDiskDataset(Dataset):
...
@@ -285,24 +282,24 @@ class OnDiskDataset(Dataset):
self
.
_num_labels
=
self
.
_meta
.
num_labels
self
.
_num_labels
=
self
.
_meta
.
num_labels
self
.
_graph
=
self
.
_load_graph
(
self
.
_meta
.
graph_topology
)
self
.
_graph
=
self
.
_load_graph
(
self
.
_meta
.
graph_topology
)
self
.
_feature
=
load_feature_stores
(
self
.
_meta
.
feature_data
)
self
.
_feature
=
load_feature_stores
(
self
.
_meta
.
feature_data
)
self
.
_train_set
s
=
self
.
_init_tvt_set
s
(
self
.
_meta
.
train_set
s
)
self
.
_train_set
=
self
.
_init_tvt_set
(
self
.
_meta
.
train_set
)
self
.
_validation_set
s
=
self
.
_init_tvt_set
s
(
self
.
_meta
.
validation_set
s
)
self
.
_validation_set
=
self
.
_init_tvt_set
(
self
.
_meta
.
validation_set
)
self
.
_test_set
s
=
self
.
_init_tvt_set
s
(
self
.
_meta
.
test_set
s
)
self
.
_test_set
=
self
.
_init_tvt_set
(
self
.
_meta
.
test_set
)
@
property
@
property
def
train_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
train_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the training set."""
"""Return the training set."""
return
self
.
_train_set
s
return
self
.
_train_set
@
property
@
property
def
validation_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
validation_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the validation set."""
"""Return the validation set."""
return
self
.
_validation_set
s
return
self
.
_validation_set
@
property
@
property
def
test_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
test_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the test set."""
"""Return the test set."""
return
self
.
_test_set
s
return
self
.
_test_set
@
property
@
property
def
graph
(
self
)
->
object
:
def
graph
(
self
)
->
object
:
...
@@ -341,28 +338,23 @@ class OnDiskDataset(Dataset):
...
@@ -341,28 +338,23 @@ class OnDiskDataset(Dataset):
f
"Graph topology type
{
graph_topology
.
type
}
is not supported."
f
"Graph topology type
{
graph_topology
.
type
}
is not supported."
)
)
def
_init_tvt_sets
(
def
_init_tvt_set
(
self
,
tvt_sets
:
List
[
List
[
OnDiskTVTSet
]]
self
,
tvt_set
:
List
[
OnDiskTVTSet
]
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]:
)
->
ItemSet
or
ItemSetDict
:
"""Initialize the TVT sets."""
"""Initialize the TVT set."""
if
(
tvt_sets
is
None
)
or
(
len
(
tvt_sets
)
==
0
):
ret
=
None
return
None
ret
=
[]
for
tvt_set
in
tvt_sets
:
if
(
tvt_set
is
None
)
or
(
len
(
tvt_set
)
==
0
):
if
(
tvt_set
is
None
)
or
(
len
(
tvt_set
)
==
0
):
ret
.
append
(
None
)
return
ret
if
tvt_set
[
0
].
type
is
None
:
if
tvt_set
[
0
].
type
is
None
:
assert
(
assert
(
len
(
tvt_set
)
==
1
len
(
tvt_set
)
==
1
),
"Only one TVT set is allowed if type is not specified."
),
"Only one TVT set is allowed if type is not specified."
ret
.
append
(
ret
=
ItemSet
(
ItemSet
(
tuple
(
tuple
(
read_data
(
data
.
path
,
data
.
format
,
data
.
in_memory
)
read_data
(
data
.
path
,
data
.
format
,
data
.
in_memory
)
for
data
in
tvt_set
[
0
].
data
for
data
in
tvt_set
[
0
].
data
)
)
)
)
)
else
:
else
:
data
=
{}
data
=
{}
for
tvt
in
tvt_set
:
for
tvt
in
tvt_set
:
...
@@ -372,5 +364,5 @@ class OnDiskDataset(Dataset):
...
@@ -372,5 +364,5 @@ class OnDiskDataset(Dataset):
for
data
in
tvt
.
data
for
data
in
tvt
.
data
)
)
)
)
ret
.
append
(
ItemSetDict
(
data
)
)
ret
=
ItemSetDict
(
data
)
return
ret
return
ret
python/dgl/graphbolt/impl/ondisk_metadata.py
View file @
47d37e91
...
@@ -83,6 +83,6 @@ class OnDiskMetaData(pydantic.BaseModel):
...
@@ -83,6 +83,6 @@ class OnDiskMetaData(pydantic.BaseModel):
num_labels
:
Optional
[
int
]
=
None
num_labels
:
Optional
[
int
]
=
None
graph_topology
:
Optional
[
OnDiskGraphTopology
]
=
None
graph_topology
:
Optional
[
OnDiskGraphTopology
]
=
None
feature_data
:
Optional
[
List
[
OnDiskFeatureData
]]
=
[]
feature_data
:
Optional
[
List
[
OnDiskFeatureData
]]
=
[]
train_set
s
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]
]
=
[]
train_set
:
Optional
[
List
[
OnDiskTVTSet
]]
=
[]
validation_set
s
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]
]
=
[]
validation_set
:
Optional
[
List
[
OnDiskTVTSet
]]
=
[]
test_set
s
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]
]
=
[]
test_set
:
Optional
[
List
[
OnDiskTVTSet
]]
=
[]
tests/python/pytorch/graphbolt/test_dataset.py
View file @
47d37e91
import
os
import
tempfile
import
numpy
as
np
import
pydantic
import
pytest
import
pytest
from
dgl
import
graphbolt
as
gb
from
dgl
import
graphbolt
as
gb
...
@@ -11,15 +5,15 @@ from dgl import graphbolt as gb
...
@@ -11,15 +5,15 @@ from dgl import graphbolt as gb
def
test_Dataset
():
def
test_Dataset
():
dataset
=
gb
.
Dataset
()
dataset
=
gb
.
Dataset
()
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
train_set
s
()
_
=
dataset
.
train_set
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
validation_set
s
()
_
=
dataset
.
validation_set
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
test_set
s
()
_
=
dataset
.
test_set
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
graph
()
_
=
dataset
.
graph
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
feature
()
_
=
dataset
.
feature
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
dataset_name
_
=
dataset
.
dataset_name
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
...
...
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
View file @
47d37e91
...
@@ -20,8 +20,8 @@ def test_OnDiskDataset_TVTSet_exceptions():
...
@@ -20,8 +20,8 @@ def test_OnDiskDataset_TVTSet_exceptions():
# Case 1: ``format`` is invalid.
# Case 1: ``format`` is invalid.
yaml_content
=
"""
yaml_content
=
"""
train_set
s
:
train_set:
-
-
type: paper
- type: paper
data:
data:
- format: torch_invalid
- format: torch_invalid
path: set/paper-train.pt
path: set/paper-train.pt
...
@@ -34,8 +34,8 @@ def test_OnDiskDataset_TVTSet_exceptions():
...
@@ -34,8 +34,8 @@ def test_OnDiskDataset_TVTSet_exceptions():
# Case 2: ``type`` is not specified while multiple TVT sets are specified.
# Case 2: ``type`` is not specified while multiple TVT sets are specified.
yaml_content
=
"""
yaml_content
=
"""
train_set
s
:
train_set:
-
-
type: null
- type: null
data:
data:
- format: numpy
- format: numpy
path: set/train.npy
path: set/train.npy
...
@@ -82,8 +82,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
...
@@ -82,8 +82,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
# ``type`` is not specified or specified as ``null``.
# ``type`` is not specified or specified as ``null``.
# ``in_memory`` could be ``true`` and ``false``.
# ``in_memory`` could be ``true`` and ``false``.
yaml_content
=
f
"""
yaml_content
=
f
"""
train_set
s
:
train_set:
-
-
type: null
- type: null
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
...
@@ -91,16 +91,16 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
...
@@ -91,16 +91,16 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
train_labels_path
}
path:
{
train_labels_path
}
validation_set
s
:
validation_set:
-
-
data:
- data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
validation_ids_path
}
path:
{
validation_ids_path
}
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
validation_labels_path
}
path:
{
validation_labels_path
}
test_set
s
:
test_set:
-
-
type: null
- type: null
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
...
@@ -116,43 +116,37 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
...
@@ -116,43 +116,37 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
# Verify train set.
# Verify train set.
train_sets
=
dataset
.
train_sets
train_set
=
dataset
.
train_set
assert
len
(
train_sets
)
==
1
for
train_set
in
train_sets
:
assert
len
(
train_set
)
==
1000
assert
len
(
train_set
)
==
1000
assert
isinstance
(
train_set
,
gb
.
ItemSet
)
assert
isinstance
(
train_set
,
gb
.
ItemSet
)
for
i
,
(
id
,
label
)
in
enumerate
(
train_set
):
for
i
,
(
id
,
label
)
in
enumerate
(
train_set
):
assert
id
==
train_ids
[
i
]
assert
id
==
train_ids
[
i
]
assert
label
==
train_labels
[
i
]
assert
label
==
train_labels
[
i
]
train_set
s
=
None
train_set
=
None
# Verify validation set.
# Verify validation set.
validation_sets
=
dataset
.
validation_sets
validation_set
=
dataset
.
validation_set
assert
len
(
validation_sets
)
==
1
for
validation_set
in
validation_sets
:
assert
len
(
validation_set
)
==
1000
assert
len
(
validation_set
)
==
1000
assert
isinstance
(
validation_set
,
gb
.
ItemSet
)
assert
isinstance
(
validation_set
,
gb
.
ItemSet
)
for
i
,
(
id
,
label
)
in
enumerate
(
validation_set
):
for
i
,
(
id
,
label
)
in
enumerate
(
validation_set
):
assert
id
==
validation_ids
[
i
]
assert
id
==
validation_ids
[
i
]
assert
label
==
validation_labels
[
i
]
assert
label
==
validation_labels
[
i
]
validation_set
s
=
None
validation_set
=
None
# Verify test set.
# Verify test set.
test_sets
=
dataset
.
test_sets
test_set
=
dataset
.
test_set
assert
len
(
test_sets
)
==
1
for
test_set
in
test_sets
:
assert
len
(
test_set
)
==
1000
assert
len
(
test_set
)
==
1000
assert
isinstance
(
test_set
,
gb
.
ItemSet
)
assert
isinstance
(
test_set
,
gb
.
ItemSet
)
for
i
,
(
id
,
label
)
in
enumerate
(
test_set
):
for
i
,
(
id
,
label
)
in
enumerate
(
test_set
):
assert
id
==
test_ids
[
i
]
assert
id
==
test_ids
[
i
]
assert
label
==
test_labels
[
i
]
assert
label
==
test_labels
[
i
]
test_set
s
=
None
test_set
=
None
dataset
=
None
dataset
=
None
# Case 2: Some TVT sets are None.
# Case 2: Some TVT sets are None.
yaml_content
=
f
"""
yaml_content
=
f
"""
train_set
s
:
train_set:
-
-
type: null
- type: null
data:
data:
- format: numpy
- format: numpy
path:
{
train_ids_path
}
path:
{
train_ids_path
}
...
@@ -162,9 +156,9 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
...
@@ -162,9 +156,9 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
f
.
write
(
yaml_content
)
f
.
write
(
yaml_content
)
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
assert
dataset
.
train_set
s
is
not
None
assert
dataset
.
train_set
is
not
None
assert
dataset
.
validation_set
s
is
None
assert
dataset
.
validation_set
is
None
assert
dataset
.
test_set
s
is
None
assert
dataset
.
test_set
is
None
dataset
=
None
dataset
=
None
...
@@ -202,8 +196,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
...
@@ -202,8 +196,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
np
.
save
(
test_labels_path
,
test_labels
)
np
.
save
(
test_labels_path
,
test_labels
)
yaml_content
=
f
"""
yaml_content
=
f
"""
train_set
s
:
train_set:
-
-
type: null
- type: null
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
...
@@ -214,8 +208,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
...
@@ -214,8 +208,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
train_labels_path
}
path:
{
train_labels_path
}
validation_set
s
:
validation_set:
-
-
data:
- data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
validation_src_path
}
path:
{
validation_src_path
}
...
@@ -225,8 +219,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
...
@@ -225,8 +219,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
validation_labels_path
}
path:
{
validation_labels_path
}
test_set
s
:
test_set:
-
-
type: null
- type: null
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
...
@@ -245,40 +239,34 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
...
@@ -245,40 +239,34 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
# Verify train set.
# Verify train set.
train_sets
=
dataset
.
train_sets
train_set
=
dataset
.
train_set
assert
len
(
train_sets
)
==
1
for
train_set
in
train_sets
:
assert
len
(
train_set
)
==
1000
assert
len
(
train_set
)
==
1000
assert
isinstance
(
train_set
,
gb
.
ItemSet
)
assert
isinstance
(
train_set
,
gb
.
ItemSet
)
for
i
,
(
src
,
dst
,
label
)
in
enumerate
(
train_set
):
for
i
,
(
src
,
dst
,
label
)
in
enumerate
(
train_set
):
assert
src
==
train_src
[
i
]
assert
src
==
train_src
[
i
]
assert
dst
==
train_dst
[
i
]
assert
dst
==
train_dst
[
i
]
assert
label
==
train_labels
[
i
]
assert
label
==
train_labels
[
i
]
train_set
s
=
None
train_set
=
None
# Verify validation set.
# Verify validation set.
validation_sets
=
dataset
.
validation_sets
validation_set
=
dataset
.
validation_set
assert
len
(
validation_sets
)
==
1
for
validation_set
in
validation_sets
:
assert
len
(
validation_set
)
==
1000
assert
len
(
validation_set
)
==
1000
assert
isinstance
(
validation_set
,
gb
.
ItemSet
)
assert
isinstance
(
validation_set
,
gb
.
ItemSet
)
for
i
,
(
src
,
dst
,
label
)
in
enumerate
(
validation_set
):
for
i
,
(
src
,
dst
,
label
)
in
enumerate
(
validation_set
):
assert
src
==
validation_src
[
i
]
assert
src
==
validation_src
[
i
]
assert
dst
==
validation_dst
[
i
]
assert
dst
==
validation_dst
[
i
]
assert
label
==
validation_labels
[
i
]
assert
label
==
validation_labels
[
i
]
validation_set
s
=
None
validation_set
=
None
# Verify test set.
# Verify test set.
test_sets
=
dataset
.
test_sets
test_set
=
dataset
.
test_set
assert
len
(
test_sets
)
==
1
for
test_set
in
test_sets
:
assert
len
(
test_set
)
==
1000
assert
len
(
test_set
)
==
1000
assert
isinstance
(
test_set
,
gb
.
ItemSet
)
assert
isinstance
(
test_set
,
gb
.
ItemSet
)
for
i
,
(
src
,
dst
,
label
)
in
enumerate
(
test_set
):
for
i
,
(
src
,
dst
,
label
)
in
enumerate
(
test_set
):
assert
src
==
test_src
[
i
]
assert
src
==
test_src
[
i
]
assert
dst
==
test_dst
[
i
]
assert
dst
==
test_dst
[
i
]
assert
label
==
test_labels
[
i
]
assert
label
==
test_labels
[
i
]
test_set
s
=
None
test_set
=
None
dataset
=
None
dataset
=
None
...
@@ -320,8 +308,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
...
@@ -320,8 +308,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
np
.
save
(
test_neg_dst_path
,
test_neg_dst
)
np
.
save
(
test_neg_dst_path
,
test_neg_dst
)
yaml_content
=
f
"""
yaml_content
=
f
"""
train_set
s
:
train_set:
-
-
type: null
- type: null
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
...
@@ -332,8 +320,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
...
@@ -332,8 +320,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
train_neg_dst_path
}
path:
{
train_neg_dst_path
}
validation_set
s
:
validation_set:
-
-
data:
- data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
validation_src_path
}
path:
{
validation_src_path
}
...
@@ -343,8 +331,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
...
@@ -343,8 +331,8 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
validation_neg_dst_path
}
path:
{
validation_neg_dst_path
}
test_set
s
:
test_set:
-
-
type: null
- type: null
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
...
@@ -363,42 +351,34 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
...
@@ -363,42 +351,34 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
# Verify train set.
# Verify train set.
train_sets
=
dataset
.
train_sets
train_set
=
dataset
.
train_set
assert
len
(
train_sets
)
==
1
for
train_set
in
train_sets
:
assert
len
(
train_set
)
==
1000
assert
len
(
train_set
)
==
1000
assert
isinstance
(
train_set
,
gb
.
ItemSet
)
assert
isinstance
(
train_set
,
gb
.
ItemSet
)
for
i
,
(
src
,
dst
,
negs
)
in
enumerate
(
train_set
):
for
i
,
(
src
,
dst
,
negs
)
in
enumerate
(
train_set
):
assert
src
==
train_src
[
i
]
assert
src
==
train_src
[
i
]
assert
dst
==
train_dst
[
i
]
assert
dst
==
train_dst
[
i
]
assert
torch
.
equal
(
negs
,
torch
.
from_numpy
(
train_neg_dst
[
i
]))
assert
torch
.
equal
(
negs
,
torch
.
from_numpy
(
train_neg_dst
[
i
]))
train_set
s
=
None
train_set
=
None
# Verify validation set.
# Verify validation set.
validation_sets
=
dataset
.
validation_sets
validation_set
=
dataset
.
validation_set
assert
len
(
validation_sets
)
==
1
for
validation_set
in
validation_sets
:
assert
len
(
validation_set
)
==
1000
assert
len
(
validation_set
)
==
1000
assert
isinstance
(
validation_set
,
gb
.
ItemSet
)
assert
isinstance
(
validation_set
,
gb
.
ItemSet
)
for
i
,
(
src
,
dst
,
negs
)
in
enumerate
(
validation_set
):
for
i
,
(
src
,
dst
,
negs
)
in
enumerate
(
validation_set
):
assert
src
==
validation_src
[
i
]
assert
src
==
validation_src
[
i
]
assert
dst
==
validation_dst
[
i
]
assert
dst
==
validation_dst
[
i
]
assert
torch
.
equal
(
assert
torch
.
equal
(
negs
,
torch
.
from_numpy
(
validation_neg_dst
[
i
]))
negs
,
torch
.
from_numpy
(
validation_neg_dst
[
i
])
validation_set
=
None
)
validation_sets
=
None
# Verify test set.
# Verify test set.
test_sets
=
dataset
.
test_sets
test_set
=
dataset
.
test_set
assert
len
(
test_sets
)
==
1
for
test_set
in
test_sets
:
assert
len
(
test_set
)
==
1000
assert
len
(
test_set
)
==
1000
assert
isinstance
(
test_set
,
gb
.
ItemSet
)
assert
isinstance
(
test_set
,
gb
.
ItemSet
)
for
i
,
(
src
,
dst
,
negs
)
in
enumerate
(
test_set
):
for
i
,
(
src
,
dst
,
negs
)
in
enumerate
(
test_set
):
assert
src
==
test_src
[
i
]
assert
src
==
test_src
[
i
]
assert
dst
==
test_dst
[
i
]
assert
dst
==
test_dst
[
i
]
assert
torch
.
equal
(
negs
,
torch
.
from_numpy
(
test_neg_dst
[
i
]))
assert
torch
.
equal
(
negs
,
torch
.
from_numpy
(
test_neg_dst
[
i
]))
test_set
s
=
None
test_set
=
None
dataset
=
None
dataset
=
None
...
@@ -424,32 +404,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
...
@@ -424,32 +404,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
np
.
save
(
test_path
,
test_data
)
np
.
save
(
test_path
,
test_data
)
yaml_content
=
f
"""
yaml_content
=
f
"""
train_set
s
:
train_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
train_path
}
path:
{
train_path
}
-
-
type: author
- type: author
data:
data:
- format: numpy
- format: numpy
path:
{
train_path
}
path:
{
train_path
}
validation_set
s
:
validation_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
path:
{
validation_path
}
path:
{
validation_path
}
-
-
type: author
- type: author
data:
data:
- format: numpy
- format: numpy
path:
{
validation_path
}
path:
{
validation_path
}
test_set
s
:
test_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
in_memory: false
in_memory: false
path:
{
test_path
}
path:
{
test_path
}
-
-
type: author
- type: author
data:
data:
- format: numpy
- format: numpy
path:
{
test_path
}
path:
{
test_path
}
...
@@ -461,10 +441,8 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
...
@@ -461,10 +441,8 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
# Verify train set.
# Verify train set.
train_sets
=
dataset
.
train_sets
train_set
=
dataset
.
train_set
assert
len
(
train_sets
)
==
2
assert
len
(
train_set
)
==
2000
for
train_set
in
train_sets
:
assert
len
(
train_set
)
==
1000
assert
isinstance
(
train_set
,
gb
.
ItemSetDict
)
assert
isinstance
(
train_set
,
gb
.
ItemSetDict
)
for
i
,
item
in
enumerate
(
train_set
):
for
i
,
item
in
enumerate
(
train_set
):
assert
isinstance
(
item
,
dict
)
assert
isinstance
(
item
,
dict
)
...
@@ -472,41 +450,37 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
...
@@ -472,41 +450,37 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
key
=
list
(
item
.
keys
())[
0
]
key
=
list
(
item
.
keys
())[
0
]
assert
key
in
[
"paper"
,
"author"
]
assert
key
in
[
"paper"
,
"author"
]
id
,
label
=
item
[
key
]
id
,
label
=
item
[
key
]
assert
id
==
train_ids
[
i
]
assert
id
==
train_ids
[
i
%
1000
]
assert
label
==
train_labels
[
i
]
assert
label
==
train_labels
[
i
%
1000
]
train_set
s
=
None
train_set
=
None
# Verify validation set.
# Verify validation set.
validation_sets
=
dataset
.
validation_sets
validation_set
=
dataset
.
validation_set
assert
len
(
validation_sets
)
==
2
assert
len
(
validation_set
)
==
2000
for
validation_set
in
validation_sets
:
assert
isinstance
(
validation_set
,
gb
.
ItemSetDict
)
assert
len
(
validation_set
)
==
1000
assert
isinstance
(
train_set
,
gb
.
ItemSetDict
)
for
i
,
item
in
enumerate
(
validation_set
):
for
i
,
item
in
enumerate
(
validation_set
):
assert
isinstance
(
item
,
dict
)
assert
isinstance
(
item
,
dict
)
assert
len
(
item
)
==
1
assert
len
(
item
)
==
1
key
=
list
(
item
.
keys
())[
0
]
key
=
list
(
item
.
keys
())[
0
]
assert
key
in
[
"paper"
,
"author"
]
assert
key
in
[
"paper"
,
"author"
]
id
,
label
=
item
[
key
]
id
,
label
=
item
[
key
]
assert
id
==
validation_ids
[
i
]
assert
id
==
validation_ids
[
i
%
1000
]
assert
label
==
validation_labels
[
i
]
assert
label
==
validation_labels
[
i
%
1000
]
validation_set
s
=
None
validation_set
=
None
# Verify test set.
# Verify test set.
test_sets
=
dataset
.
test_sets
test_set
=
dataset
.
test_set
assert
len
(
test_sets
)
==
2
assert
len
(
test_set
)
==
2000
for
test_set
in
test_sets
:
assert
isinstance
(
test_set
,
gb
.
ItemSetDict
)
assert
len
(
test_set
)
==
1000
assert
isinstance
(
train_set
,
gb
.
ItemSetDict
)
for
i
,
item
in
enumerate
(
test_set
):
for
i
,
item
in
enumerate
(
test_set
):
assert
isinstance
(
item
,
dict
)
assert
isinstance
(
item
,
dict
)
assert
len
(
item
)
==
1
assert
len
(
item
)
==
1
key
=
list
(
item
.
keys
())[
0
]
key
=
list
(
item
.
keys
())[
0
]
assert
key
in
[
"paper"
,
"author"
]
assert
key
in
[
"paper"
,
"author"
]
id
,
label
=
item
[
key
]
id
,
label
=
item
[
key
]
assert
id
==
test_ids
[
i
]
assert
id
==
test_ids
[
i
%
1000
]
assert
label
==
test_labels
[
i
]
assert
label
==
test_labels
[
i
%
1000
]
test_set
s
=
None
test_set
=
None
dataset
=
None
dataset
=
None
...
@@ -532,32 +506,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
...
@@ -532,32 +506,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
np
.
save
(
test_path
,
test_data
)
np
.
save
(
test_path
,
test_data
)
yaml_content
=
f
"""
yaml_content
=
f
"""
train_set
s
:
train_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path:
{
train_path
}
path:
{
train_path
}
-
-
type: author
- type: author
data:
data:
- format: numpy
- format: numpy
path:
{
train_path
}
path:
{
train_path
}
validation_set
s
:
validation_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
path:
{
validation_path
}
path:
{
validation_path
}
-
-
type: author
- type: author
data:
data:
- format: numpy
- format: numpy
path:
{
validation_path
}
path:
{
validation_path
}
test_set
s
:
test_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
in_memory: false
in_memory: false
path:
{
test_path
}
path:
{
test_path
}
-
-
type: author
- type: author
data:
data:
- format: numpy
- format: numpy
path:
{
test_path
}
path:
{
test_path
}
...
@@ -569,10 +543,8 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
...
@@ -569,10 +543,8 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
dataset
=
gb
.
OnDiskDataset
(
yaml_file
)
# Verify train set.
# Verify train set.
train_sets
=
dataset
.
train_sets
train_set
=
dataset
.
train_set
assert
len
(
train_sets
)
==
2
assert
len
(
train_set
)
==
2000
for
train_set
in
train_sets
:
assert
len
(
train_set
)
==
1000
assert
isinstance
(
train_set
,
gb
.
ItemSetDict
)
assert
isinstance
(
train_set
,
gb
.
ItemSetDict
)
for
i
,
item
in
enumerate
(
train_set
):
for
i
,
item
in
enumerate
(
train_set
):
assert
isinstance
(
item
,
dict
)
assert
isinstance
(
item
,
dict
)
...
@@ -580,44 +552,40 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
...
@@ -580,44 +552,40 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
key
=
list
(
item
.
keys
())[
0
]
key
=
list
(
item
.
keys
())[
0
]
assert
key
in
[
"paper"
,
"author"
]
assert
key
in
[
"paper"
,
"author"
]
src
,
dst
,
label
=
item
[
key
]
src
,
dst
,
label
=
item
[
key
]
assert
src
==
train_pairs
[
0
][
i
]
assert
src
==
train_pairs
[
0
][
i
%
1000
]
assert
dst
==
train_pairs
[
1
][
i
]
assert
dst
==
train_pairs
[
1
][
i
%
1000
]
assert
label
==
train_labels
[
i
]
assert
label
==
train_labels
[
i
%
1000
]
train_set
s
=
None
train_set
=
None
# Verify validation set.
# Verify validation set.
validation_sets
=
dataset
.
validation_sets
validation_set
=
dataset
.
validation_set
assert
len
(
validation_sets
)
==
2
assert
len
(
validation_set
)
==
2000
for
validation_set
in
validation_sets
:
assert
isinstance
(
validation_set
,
gb
.
ItemSetDict
)
assert
len
(
validation_set
)
==
1000
assert
isinstance
(
train_set
,
gb
.
ItemSetDict
)
for
i
,
item
in
enumerate
(
validation_set
):
for
i
,
item
in
enumerate
(
validation_set
):
assert
isinstance
(
item
,
dict
)
assert
isinstance
(
item
,
dict
)
assert
len
(
item
)
==
1
assert
len
(
item
)
==
1
key
=
list
(
item
.
keys
())[
0
]
key
=
list
(
item
.
keys
())[
0
]
assert
key
in
[
"paper"
,
"author"
]
assert
key
in
[
"paper"
,
"author"
]
src
,
dst
,
label
=
item
[
key
]
src
,
dst
,
label
=
item
[
key
]
assert
src
==
validation_pairs
[
0
][
i
]
assert
src
==
validation_pairs
[
0
][
i
%
1000
]
assert
dst
==
validation_pairs
[
1
][
i
]
assert
dst
==
validation_pairs
[
1
][
i
%
1000
]
assert
label
==
validation_labels
[
i
]
assert
label
==
validation_labels
[
i
%
1000
]
validation_set
s
=
None
validation_set
=
None
# Verify test set.
# Verify test set.
test_sets
=
dataset
.
test_sets
test_set
=
dataset
.
test_set
assert
len
(
test_sets
)
==
2
assert
len
(
test_set
)
==
2000
for
test_set
in
test_sets
:
assert
isinstance
(
test_set
,
gb
.
ItemSetDict
)
assert
len
(
test_set
)
==
1000
assert
isinstance
(
train_set
,
gb
.
ItemSetDict
)
for
i
,
item
in
enumerate
(
test_set
):
for
i
,
item
in
enumerate
(
test_set
):
assert
isinstance
(
item
,
dict
)
assert
isinstance
(
item
,
dict
)
assert
len
(
item
)
==
1
assert
len
(
item
)
==
1
key
=
list
(
item
.
keys
())[
0
]
key
=
list
(
item
.
keys
())[
0
]
assert
key
in
[
"paper"
,
"author"
]
assert
key
in
[
"paper"
,
"author"
]
src
,
dst
,
label
=
item
[
key
]
src
,
dst
,
label
=
item
[
key
]
assert
src
==
test_pairs
[
0
][
i
]
assert
src
==
test_pairs
[
0
][
i
%
1000
]
assert
dst
==
test_pairs
[
1
][
i
]
assert
dst
==
test_pairs
[
1
][
i
%
1000
]
assert
label
==
test_labels
[
i
]
assert
label
==
test_labels
[
i
%
1000
]
test_set
s
=
None
test_set
=
None
dataset
=
None
dataset
=
None
...
@@ -995,18 +963,18 @@ def test_OnDiskDataset_preprocess_homogeneous():
...
@@ -995,18 +963,18 @@ def test_OnDiskDataset_preprocess_homogeneous():
format: numpy
format: numpy
in_memory: false
in_memory: false
path: data/node-feat.npy
path: data/node-feat.npy
train_set
s
:
train_set:
-
-
type_name: null
- type_name: null
data:
data:
- format: numpy
- format: numpy
path: set/train.npy
path: set/train.npy
validation_set
s
:
validation_set:
-
-
type_name: null
- type_name: null
data:
data:
- format: numpy
- format: numpy
path: set/validation.npy
path: set/validation.npy
test_set
s
:
test_set:
-
-
type_name: null
- type_name: null
data:
data:
- format: numpy
- format: numpy
path: set/test.npy
path: set/test.npy
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment