OpenDAS / dgl, commit 14f396d0 (unverified)

[GraphBolt] change TVT format of OnDiskDataset (#6076)

Authored Aug 01, 2023 by Rhett Ying, committed via GitHub on Aug 01, 2023. Parent: 17f6c4c9

Showing 4 changed files with 347 additions and 163 deletions:

  python/dgl/graphbolt/impl/ondisk_dataset.py             +55   -42
  python/dgl/graphbolt/impl/ondisk_metadata.py            +10   -3
  python/dgl/graphbolt/utils/internal.py                  +0    -6
  tests/python/pytorch/graphbolt/test_ondisk_dataset.py   +282  -112
python/dgl/graphbolt/impl/ondisk_dataset.py

@@ -15,7 +15,7 @@ import dgl
 from ..dataset import Dataset
 from ..itemset import ItemSet, ItemSetDict
-from ..utils import read_data, save_data, tensor_to_tuple
+from ..utils import read_data, save_data
 from .csc_sampling_graph import (
     CSCSamplingGraph,
@@ -173,33 +173,35 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
         ):
             for input_set_per_type, output_set_per_type in zip(
                 intput_set_split, output_set_split
             ):
-                # Always save the feature in numpy format.
-                output_set_per_type["format"] = "numpy"
-                output_set_per_type["path"] = str(
-                    processed_dir_prefix
-                    / input_set_per_type["path"].replace("pt", "npy")
-                )
-                if input_set_per_type["format"] == "numpy":
-                    # If the original format is numpy, just copy the file.
-                    os.makedirs(
-                        dataset_path
-                        / os.path.dirname(output_set_per_type["path"]),
-                        exist_ok=True,
-                    )
-                    shutil.copy(
-                        dataset_path / input_set_per_type["path"],
-                        dataset_path / output_set_per_type["path"],
-                    )
-                else:
-                    # If the original format is not numpy, convert it to numpy.
-                    input_set = read_data(
-                        dataset_path / input_set_per_type["path"],
-                        input_set_per_type["format"],
-                    )
-                    save_data(
-                        input_set,
-                        dataset_path / output_set_per_type["path"],
-                        output_set_per_type["format"],
-                    )
+                for input_data, output_data in zip(
+                    input_set_per_type["data"], output_set_per_type["data"]
+                ):
+                    # Always save the feature in numpy format.
+                    output_data["format"] = "numpy"
+                    output_data["path"] = str(
+                        processed_dir_prefix
+                        / input_data["path"].replace("pt", "npy")
+                    )
+                    if input_data["format"] == "numpy":
+                        # If the original format is numpy, just copy the file.
+                        os.makedirs(
+                            dataset_path
+                            / os.path.dirname(output_data["path"]),
+                            exist_ok=True,
+                        )
+                        shutil.copy(
+                            dataset_path / input_data["path"],
+                            dataset_path / output_data["path"],
+                        )
+                    else:
+                        # If the original format is not numpy, convert it to numpy.
+                        input_set = read_data(
+                            dataset_path / input_data["path"],
+                            input_data["format"],
+                        )
+                        save_data(
+                            input_set,
+                            dataset_path / output_data["path"],
+                            output_set_per_type["format"],
+                        )
@@ -245,17 +247,23 @@ class OnDiskDataset(Dataset):
             path: edge_data/author-writes-paper-feat.npy
     train_sets:
       - - type: paper # could be null for homogeneous graph.
-          format: numpy
-          in_memory: true # If not specified, default to true.
-          path: set/paper-train.npy
+          data: # multiple data sources could be specified.
+            - format: numpy
+              in_memory: true # If not specified, default to true.
+              path: set/paper-train-src.npy
+            - format: numpy
+              in_memory: false
+              path: set/paper-train-dst.npy
     validation_sets:
       - - type: paper
-          format: numpy
-          in_memory: true
-          path: set/paper-validation.npy
+          data:
+            - format: numpy
+              in_memory: true
+              path: set/paper-validation.npy
     test_sets:
       - - type: paper
-          format: numpy
-          in_memory: true
-          path: set/paper-test.npy
+          data:
+            - format: numpy
+              in_memory: true
+              path: set/paper-test.npy
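
For context, a short usage sketch of the layout documented above (not part of this commit; the file names, array sizes, and the gb alias are illustrative and mirror the tests further down):

    import numpy as np
    import dgl.graphbolt as gb

    # One .npy file per field, each referenced from the "data" list of a single
    # train_sets entry (as in the docstring above).
    np.save("set/paper-train-src.npy", np.arange(1000))
    np.save("set/paper-train-dst.npy", np.arange(1000, 2000))

    dataset = gb.OnDiskDataset("metadata.yaml")  # YAML as sketched in the docstring
    train_set = dataset.train_sets[0]            # one ItemSet per train_sets entry
    for src, dst in train_set:                   # each item zips the listed files
        break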
@@ -347,16 +355,21 @@ class OnDiskDataset(Dataset):
                 assert (
                     len(tvt_set) == 1
                 ), "Only one TVT set is allowed if type is not specified."
-                data = read_data(
-                    tvt_set[0].path, tvt_set[0].format, tvt_set[0].in_memory
-                )
-                ret.append(ItemSet(tensor_to_tuple(data)))
+                ret.append(
+                    ItemSet(
+                        tuple(
+                            read_data(data.path, data.format, data.in_memory)
+                            for data in tvt_set[0].data
+                        )
+                    )
+                )
             else:
                 data = {}
                 for tvt in tvt_set:
                     data[tvt.type] = ItemSet(
-                        tensor_to_tuple(
-                            read_data(tvt.path, tvt.format, tvt.in_memory)
-                        )
+                        tuple(
+                            read_data(data.path, data.format, data.in_memory)
+                            for data in tvt.data
+                        )
                     )
                 ret.append(ItemSetDict(data))
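
The gist of the new loading path, as a standalone sketch (not library code; the ItemSet import path and its tuple-of-tensors behavior are assumed from the usage in this file): each file listed under data: is read into its own tensor, and the tensors are zipped into one ItemSet, replacing the old column-wise split of a single packed tensor.

    import torch
    from dgl.graphbolt import ItemSet  # import path assumed for this sketch

    src = torch.arange(5)        # e.g. read_data(...) on paper-train-src.npy
    dst = torch.arange(5, 10)    # e.g. read_data(...) on paper-train-dst.npy
    item_set = ItemSet((src, dst))
    for s, d in item_set:        # each item is one (src, dst) pair
        print(int(s), int(d))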
python/dgl/graphbolt/impl/ondisk_metadata.py

@@ -8,6 +8,7 @@ import pydantic
 __all__ = [
     "OnDiskFeatureDataFormat",
+    "OnDiskTVTSetData",
     "OnDiskTVTSet",
     "OnDiskFeatureDataDomain",
     "OnDiskFeatureData",
@@ -24,15 +25,21 @@ class OnDiskFeatureDataFormat(str, Enum):
     NUMPY = "numpy"


-class OnDiskTVTSet(pydantic.BaseModel):
-    """Train-Validation-Test set."""
+class OnDiskTVTSetData(pydantic.BaseModel):
+    """Train-Validation-Test set data."""

-    type: Optional[str] = None
     format: OnDiskFeatureDataFormat
     in_memory: Optional[bool] = True
     path: str


+class OnDiskTVTSet(pydantic.BaseModel):
+    """Train-Validation-Test set."""
+
+    type: Optional[str] = None
+    data: List[OnDiskTVTSetData]
+
+
 class OnDiskFeatureDataDomain(str, Enum):
     """Enum of feature data domain."""
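
For illustration only (not part of the diff; the import path follows the file location above, and the YAML snippet is made up): how the two models consume one entry of the new format, with in_memory falling back to its True default when omitted.

    import yaml
    from dgl.graphbolt.impl.ondisk_metadata import OnDiskTVTSet

    entry = yaml.safe_load(
        """
        type: paper
        data:
          - format: numpy
            in_memory: false
            path: set/paper-train-src.npy
          - format: numpy
            path: set/paper-train-dst.npy
        """
    )
    tvt = OnDiskTVTSet(**entry)            # validates type plus the nested data list
    assert tvt.data[0].in_memory is False
    assert tvt.data[1].in_memory is True   # default applies when the key is omitted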
python/dgl/graphbolt/utils/internal.py

@@ -45,9 +45,3 @@ def save_data(data, path, fmt):
         np.save(path, data)
     elif fmt == "torch":
         torch.save(data, path)
-
-
-def tensor_to_tuple(data):
-    """Split a torch.Tensor in column-wise to a tuple."""
-    assert isinstance(data, torch.Tensor), "data must be a torch.Tensor"
-    return tuple(data.t())
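
To see why the helper could be dropped (illustrative arrays, not library code): the old layout packed IDs and labels into one (N, 2) array that tensor_to_tuple() split column-wise, whereas the new data: list stores each field in its own file, so nothing needs splitting.

    import numpy as np
    import torch

    # Old layout: one packed (N, 2) array per split, split column-wise on load.
    packed = np.vstack([np.arange(5), np.arange(5) * 10]).T
    ids_old, labels_old = tuple(torch.from_numpy(packed).t())  # what tensor_to_tuple() did

    # New layout: one file (and one tensor) per field, used as-is.
    ids_new = torch.arange(5)
    labels_new = torch.arange(5) * 10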
tests/python/pytorch/graphbolt/test_ondisk_dataset.py

@@ -22,7 +22,8 @@ def test_OnDiskDataset_TVTSet_exceptions():
         yaml_content = """
         train_sets:
           - - type: paper
-              format: torch_invalid
-              path: set/paper-train.pt
+              data:
+                - format: torch_invalid
+                  path: set/paper-train.pt
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
@@ -35,10 +36,12 @@ def test_OnDiskDataset_TVTSet_exceptions():
         yaml_content = """
         train_sets:
           - - type: null
-              format: numpy
-              path: set/train.npy
+              data:
+                - format: numpy
+                  path: set/train.npy
             - type: null
-              format: numpy
-              path: set/train.npy
+              data:
+                - format: numpy
+                  path: set/train.npy
         """
         with open(yaml_file, "w") as f:
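
A minimal sketch of why the torch_invalid entry above is expected to fail (import path assumed from the metadata module; exact wording depends on the installed pydantic version): format is validated against the OnDiskFeatureDataFormat enum.

    import pydantic
    from dgl.graphbolt.impl.ondisk_metadata import OnDiskTVTSetData

    try:
        OnDiskTVTSetData(format="torch_invalid", path="set/paper-train.pt")
    except pydantic.ValidationError as err:
        print(err)  # "torch_invalid" is not a valid OnDiskFeatureDataFormat member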
@@ -54,22 +57,25 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
     """Test TVTSet which returns ItemSet with IDs and labels."""
     with tempfile.TemporaryDirectory() as test_dir:
         train_ids = np.arange(1000)
+        train_ids_path = os.path.join(test_dir, "train_ids.npy")
+        np.save(train_ids_path, train_ids)
         train_labels = np.random.randint(0, 10, size=1000)
-        train_data = np.vstack([train_ids, train_labels]).T
-        train_path = os.path.join(test_dir, "train.npy")
-        np.save(train_path, train_data)
+        train_labels_path = os.path.join(test_dir, "train_labels.npy")
+        np.save(train_labels_path, train_labels)

         validation_ids = np.arange(1000, 2000)
+        validation_ids_path = os.path.join(test_dir, "validation_ids.npy")
+        np.save(validation_ids_path, validation_ids)
         validation_labels = np.random.randint(0, 10, size=1000)
-        validation_data = np.vstack([validation_ids, validation_labels]).T
-        validation_path = os.path.join(test_dir, "validation.npy")
-        np.save(validation_path, validation_data)
+        validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
+        np.save(validation_labels_path, validation_labels)

         test_ids = np.arange(2000, 3000)
+        test_ids_path = os.path.join(test_dir, "test_ids.npy")
+        np.save(test_ids_path, test_ids)
         test_labels = np.random.randint(0, 10, size=1000)
-        test_data = np.vstack([test_ids, test_labels]).T
-        test_path = os.path.join(test_dir, "test.npy")
-        np.save(test_path, test_data)
+        test_labels_path = os.path.join(test_dir, "test_labels.npy")
+        np.save(test_labels_path, test_labels)

         # Case 1:
         #   all TVT sets are specified.
@@ -78,26 +84,30 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         yaml_content = f"""
         train_sets:
           - - type: null
-              format: numpy
-              in_memory: true
-              path: {train_path}
-          - - type: null
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_ids_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_labels_path}
         validation_sets:
-          - - format: numpy
-              path: {validation_path}
-          - - type: null
-              format: numpy
-              path: {validation_path}
+          - - data:
+                - format: numpy
+                  in_memory: true
+                  path: {validation_ids_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_labels_path}
         test_sets:
           - - type: null
-              format: numpy
-              in_memory: false
-              path: {test_path}
-          - - type: null
-              format: numpy
-              path: {test_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {test_ids_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_labels_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
         with open(yaml_file, "w") as f:
@@ -107,7 +117,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         # Verify train set.
         train_sets = dataset.train_sets
-        assert len(train_sets) == 2
+        assert len(train_sets) == 1
         for train_set in train_sets:
             assert len(train_set) == 1000
             assert isinstance(train_set, gb.ItemSet)
@@ -118,7 +128,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         # Verify validation set.
         validation_sets = dataset.validation_sets
-        assert len(validation_sets) == 2
+        assert len(validation_sets) == 1
         for validation_set in validation_sets:
             assert len(validation_set) == 1000
             assert isinstance(validation_set, gb.ItemSet)
@@ -129,7 +139,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         # Verify test set.
         test_sets = dataset.test_sets
-        assert len(test_sets) == 2
+        assert len(test_sets) == 1
         for test_set in test_sets:
             assert len(test_set) == 1000
             assert isinstance(test_set, gb.ItemSet)
@@ -143,8 +153,9 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
         yaml_content = f"""
         train_sets:
           - - type: null
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  path: {train_ids_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
         with open(yaml_file, "w") as f:
@@ -160,47 +171,72 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
 def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
     """Test TVTSet which returns ItemSet with IDs and labels."""
     with tempfile.TemporaryDirectory() as test_dir:
-        train_pairs = (np.arange(1000), np.arange(1000, 2000))
+        train_src = np.arange(1000)
+        train_src_path = os.path.join(test_dir, "train_src.npy")
+        np.save(train_src_path, train_src)
+        train_dst = np.arange(1000, 2000)
+        train_dst_path = os.path.join(test_dir, "train_dst.npy")
+        np.save(train_dst_path, train_dst)
         train_labels = np.random.randint(0, 10, size=1000)
-        train_data = np.vstack([train_pairs, train_labels]).T
-        train_path = os.path.join(test_dir, "train.npy")
-        np.save(train_path, train_data)
+        train_labels_path = os.path.join(test_dir, "train_labels.npy")
+        np.save(train_labels_path, train_labels)

-        validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
+        validation_src = np.arange(1000, 2000)
+        validation_src_path = os.path.join(test_dir, "validation_src.npy")
+        np.save(validation_src_path, validation_src)
+        validation_dst = np.arange(2000, 3000)
+        validation_dst_path = os.path.join(test_dir, "validation_dst.npy")
+        np.save(validation_dst_path, validation_dst)
         validation_labels = np.random.randint(0, 10, size=1000)
-        validation_data = np.vstack([validation_pairs, validation_labels]).T
-        validation_path = os.path.join(test_dir, "validation.npy")
-        np.save(validation_path, validation_data)
+        validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
+        np.save(validation_labels_path, validation_labels)

-        test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
+        test_src = np.arange(2000, 3000)
+        test_src_path = os.path.join(test_dir, "test_src.npy")
+        np.save(test_src_path, test_src)
+        test_dst = np.arange(3000, 4000)
+        test_dst_path = os.path.join(test_dir, "test_dst.npy")
+        np.save(test_dst_path, test_dst)
         test_labels = np.random.randint(0, 10, size=1000)
-        test_data = np.vstack([test_pairs, test_labels]).T
-        test_path = os.path.join(test_dir, "test.npy")
-        np.save(test_path, test_data)
+        test_labels_path = os.path.join(test_dir, "test_labels.npy")
+        np.save(test_labels_path, test_labels)

         yaml_content = f"""
         train_sets:
           - - type: null
-              format: numpy
-              in_memory: true
-              path: {train_path}
-          - - type: null
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_labels_path}
         validation_sets:
-          - - format: numpy
-              path: {validation_path}
-          - - type: null
-              format: numpy
-              path: {validation_path}
+          - - data:
+                - format: numpy
+                  in_memory: true
+                  path: {validation_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_labels_path}
         test_sets:
           - - type: null
-              format: numpy
-              in_memory: false
-              path: {test_path}
-          - - type: null
-              format: numpy
-              path: {test_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {test_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_labels_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
         with open(yaml_file, "w") as f:
@@ -210,42 +246,162 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
         # Verify train set.
         train_sets = dataset.train_sets
-        assert len(train_sets) == 2
+        assert len(train_sets) == 1
         for train_set in train_sets:
             assert len(train_set) == 1000
             assert isinstance(train_set, gb.ItemSet)
             for i, (src, dst, label) in enumerate(train_set):
-                assert src == train_pairs[0][i]
-                assert dst == train_pairs[1][i]
+                assert src == train_src[i]
+                assert dst == train_dst[i]
                 assert label == train_labels[i]
         train_sets = None

         # Verify validation set.
         validation_sets = dataset.validation_sets
-        assert len(validation_sets) == 2
+        assert len(validation_sets) == 1
         for validation_set in validation_sets:
             assert len(validation_set) == 1000
             assert isinstance(validation_set, gb.ItemSet)
             for i, (src, dst, label) in enumerate(validation_set):
-                assert src == validation_pairs[0][i]
-                assert dst == validation_pairs[1][i]
+                assert src == validation_src[i]
+                assert dst == validation_dst[i]
                 assert label == validation_labels[i]
         validation_sets = None

         # Verify test set.
         test_sets = dataset.test_sets
-        assert len(test_sets) == 2
+        assert len(test_sets) == 1
         for test_set in test_sets:
             assert len(test_set) == 1000
             assert isinstance(test_set, gb.ItemSet)
             for i, (src, dst, label) in enumerate(test_set):
-                assert src == test_pairs[0][i]
-                assert dst == test_pairs[1][i]
+                assert src == test_src[i]
+                assert dst == test_dst[i]
                 assert label == test_labels[i]
         test_sets = None
     dataset = None


+def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
+    """Test TVTSet which returns ItemSet with node pairs and negative ones."""
+    with tempfile.TemporaryDirectory() as test_dir:
+        train_src = np.arange(1000)
+        train_src_path = os.path.join(test_dir, "train_src.npy")
+        np.save(train_src_path, train_src)
+        train_dst = np.arange(1000, 2000)
+        train_dst_path = os.path.join(test_dir, "train_dst.npy")
+        np.save(train_dst_path, train_dst)
+        train_neg_dst = np.random.choice(1000 * 10, size=1000 * 10).reshape(1000, 10)
+        train_neg_dst_path = os.path.join(test_dir, "train_neg_dst.npy")
+        np.save(train_neg_dst_path, train_neg_dst)
+
+        validation_src = np.arange(1000, 2000)
+        validation_src_path = os.path.join(test_dir, "validation_src.npy")
+        np.save(validation_src_path, validation_src)
+        validation_dst = np.arange(2000, 3000)
+        validation_dst_path = os.path.join(test_dir, "validation_dst.npy")
+        np.save(validation_dst_path, validation_dst)
+        validation_neg_dst = train_neg_dst + 1
+        validation_neg_dst_path = os.path.join(test_dir, "validation_neg_dst.npy")
+        np.save(validation_neg_dst_path, validation_neg_dst)
+
+        test_src = np.arange(2000, 3000)
+        test_src_path = os.path.join(test_dir, "test_src.npy")
+        np.save(test_src_path, test_src)
+        test_dst = np.arange(3000, 4000)
+        test_dst_path = os.path.join(test_dir, "test_dst.npy")
+        np.save(test_dst_path, test_dst)
+        test_neg_dst = train_neg_dst + 2
+        test_neg_dst_path = os.path.join(test_dir, "test_neg_dst.npy")
+        np.save(test_neg_dst_path, test_neg_dst)
+
+        yaml_content = f"""
+        train_sets:
+          - - type: null
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {train_neg_dst_path}
+        validation_sets:
+          - - data:
+                - format: numpy
+                  in_memory: true
+                  path: {validation_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {validation_neg_dst_path}
+        test_sets:
+          - - type: null
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {test_src_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_dst_path}
+                - format: numpy
+                  in_memory: true
+                  path: {test_neg_dst_path}
+        """
+        yaml_file = os.path.join(test_dir, "test.yaml")
+        with open(yaml_file, "w") as f:
+            f.write(yaml_content)
+
+        dataset = gb.OnDiskDataset(yaml_file)
+
+        # Verify train set.
+        train_sets = dataset.train_sets
+        assert len(train_sets) == 1
+        for train_set in train_sets:
+            assert len(train_set) == 1000
+            assert isinstance(train_set, gb.ItemSet)
+            for i, (src, dst, negs) in enumerate(train_set):
+                assert src == train_src[i]
+                assert dst == train_dst[i]
+                assert torch.equal(negs, torch.from_numpy(train_neg_dst[i]))
+        train_sets = None
+
+        # Verify validation set.
+        validation_sets = dataset.validation_sets
+        assert len(validation_sets) == 1
+        for validation_set in validation_sets:
+            assert len(validation_set) == 1000
+            assert isinstance(validation_set, gb.ItemSet)
+            for i, (src, dst, negs) in enumerate(validation_set):
+                assert src == validation_src[i]
+                assert dst == validation_dst[i]
+                assert torch.equal(negs, torch.from_numpy(validation_neg_dst[i]))
+        validation_sets = None
+
+        # Verify test set.
+        test_sets = dataset.test_sets
+        assert len(test_sets) == 1
+        for test_set in test_sets:
+            assert len(test_set) == 1000
+            assert isinstance(test_set, gb.ItemSet)
+            for i, (src, dst, negs) in enumerate(test_set):
+                assert src == test_src[i]
+                assert dst == test_dst[i]
+                assert torch.equal(negs, torch.from_numpy(test_neg_dst[i]))
+        test_sets = None
+    dataset = None
+
+
 def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
     """Test TVTSet which returns ItemSetDict with IDs and labels."""
     with tempfile.TemporaryDirectory() as test_dir:
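
The new test above keeps ten negative destinations per positive edge; a shrunken sketch of the layout it checks (sizes reduced from the test's 1000 x 10):

    import numpy as np
    import torch

    neg_dst = np.random.choice(50, size=5 * 3).reshape(5, 3)  # 5 edges, 3 negatives each
    row = torch.from_numpy(neg_dst[0])                        # per-item negatives tensor
    assert row.shape == (3,)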
@@ -270,26 +426,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
         yaml_content = f"""
         train_sets:
           - - type: paper
-              format: numpy
-              in_memory: true
-              path: {train_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_path}
           - - type: author
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  path: {train_path}
         validation_sets:
           - - type: paper
-              format: numpy
-              path: {validation_path}
+              data:
+                - format: numpy
+                  path: {validation_path}
           - - type: author
-              format: numpy
-              path: {validation_path}
+              data:
+                - format: numpy
+                  path: {validation_path}
         test_sets:
           - - type: paper
-              format: numpy
-              in_memory: false
-              path: {test_path}
+              data:
+                - format: numpy
+                  in_memory: false
+                  path: {test_path}
           - - type: author
-              format: numpy
-              path: {test_path}
+              data:
+                - format: numpy
+                  path: {test_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
@@ -372,26 +534,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
         yaml_content = f"""
         train_sets:
           - - type: paper
-              format: numpy
-              in_memory: true
-              path: {train_path}
+              data:
+                - format: numpy
+                  in_memory: true
+                  path: {train_path}
           - - type: author
-              format: numpy
-              path: {train_path}
+              data:
+                - format: numpy
+                  path: {train_path}
         validation_sets:
           - - type: paper
-              format: numpy
-              path: {validation_path}
+              data:
+                - format: numpy
+                  path: {validation_path}
           - - type: author
-              format: numpy
-              path: {validation_path}
+              data:
+                - format: numpy
+                  path: {validation_path}
         test_sets:
           - - type: paper
-              format: numpy
-              in_memory: false
-              path: {test_path}
+              data:
+                - format: numpy
+                  in_memory: false
+                  path: {test_path}
           - - type: author
-              format: numpy
-              path: {test_path}
+              data:
+                - format: numpy
+                  path: {test_path}
         """
         yaml_file = os.path.join(test_dir, "test.yaml")
@@ -829,16 +997,18 @@ def test_OnDiskDataset_preprocess_homogeneous():
           path: data/node-feat.npy
     train_sets:
       - - type_name: null
-          format: numpy
-          path: set/train.npy
+          # shape: (num_trains, 3), 3 for (src, dst, label).
+          data:
+            - format: numpy
+              path: set/train.npy
     validation_sets:
       - - type_name: null
-          format: numpy
-          path: set/validation.npy
+          data:
+            - format: numpy
+              path: set/validation.npy
     test_sets:
       - - type_name: null
-          format: numpy
-          path: set/test.npy
+          data:
+            - format: numpy
+              path: set/test.npy
     """
     yaml_file = os.path.join(test_dir, "test.yaml")