OpenDAS / dgl · Commits · 47d37e91

Commit 47d37e91 (unverified), authored Aug 02, 2023 by Rhett Ying, committed by GitHub on Aug 02, 2023.

[GraphBolt] convert TVT from list of list to list (#6080)

Parent: 12ade95c
Showing 5 changed files with 435 additions and 481 deletions (+435 -481):
  python/dgl/graphbolt/dataset.py                         +7   -7
  python/dgl/graphbolt/impl/ondisk_dataset.py             +85  -93
  python/dgl/graphbolt/impl/ondisk_metadata.py            +3   -3
  tests/python/pytorch/graphbolt/test_dataset.py          +5   -11
  tests/python/pytorch/graphbolt/test_ondisk_dataset.py   +335 -367
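In effect, the commit collapses each train/validation/test split from a list of sets into a single set. A minimal sketch of the before/after access pattern follows; the _StubDataset class and its return value are hypothetical stand-ins for illustration only, not part of this commit or of the real GraphBolt API:

    class _StubDataset:
        """Hypothetical stand-in for a concrete GraphBolt Dataset."""

        @property
        def train_set(self):
            # A real implementation would return an ItemSet (homogeneous graph)
            # or an ItemSetDict (heterogeneous graph); a plain tuple stands in here.
            return (0, 1, 2, 3, 4)


    dataset = _StubDataset()

    # Before this commit, each split was a list of sets and callers indexed into it:
    #     first_train_set = dataset.train_sets[0]
    # After it, each split is exposed as exactly one set:
    train_set = dataset.train_set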
python/dgl/graphbolt/dataset.py
"""GraphBolt Dataset."""
from
typing
import
Dict
,
List
from
typing
import
Dict
from
.feature_store
import
FeatureStore
from
.itemset
import
ItemSet
,
ItemSetDict
...
...
@@ -32,18 +32,18 @@ class Dataset:
"""
@
property
def
train_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
"""Return the training set
s
."""
def
train_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the training set."""
raise
NotImplementedError
@
property
def
validation_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
"""Return the validation set
s
."""
def
validation_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the validation set."""
raise
NotImplementedError
@
property
def
test_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
"""Return the test set
s
."""
def
test_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the test set."""
raise
NotImplementedError
@
property
...
...
python/dgl/graphbolt/impl/ondisk_dataset.py
...
...
@@ -165,45 +165,42 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
     )

     # 7. Save the train/val/test split according to the output_config.
-    for set_name in ["train_sets", "validation_sets", "test_sets"]:
+    for set_name in ["train_set", "validation_set", "test_set"]:
         if set_name not in input_config:
             continue
-        for input_set_split, output_set_split in zip(
-            input_config[set_name], output_config[set_name]
-        ):
-            for input_set_per_type, output_set_per_type in zip(
-                input_set_split, output_set_split
-            ):
-                for input_data, output_data in zip(
-                    input_set_per_type["data"], output_set_per_type["data"]
-                ):
-                    # Always save the feature in numpy format.
-                    output_data["format"] = "numpy"
-                    output_data["path"] = str(
-                        processed_dir_prefix
-                        / input_data["path"].replace("pt", "npy")
-                    )
-                    if input_data["format"] == "numpy":
-                        # If the original format is numpy, just copy the file.
-                        os.makedirs(
-                            dataset_path / os.path.dirname(output_data["path"]),
-                            exist_ok=True,
-                        )
-                        shutil.copy(
-                            dataset_path / input_data["path"],
-                            dataset_path / output_data["path"],
-                        )
-                    else:
-                        # If the original format is not numpy, convert it to numpy.
-                        input_set = read_data(
-                            dataset_path / input_data["path"],
-                            input_data["format"],
-                        )
-                        save_data(
-                            input_set,
-                            dataset_path / output_data["path"],
-                            output_set_per_type["format"],
-                        )
+        for input_set_per_type, output_set_per_type in zip(
+            input_config[set_name], output_config[set_name]
+        ):
+            for input_data, output_data in zip(
+                input_set_per_type["data"], output_set_per_type["data"]
+            ):
+                # Always save the feature in numpy format.
+                output_data["format"] = "numpy"
+                output_data["path"] = str(
+                    processed_dir_prefix
+                    / input_data["path"].replace("pt", "npy")
+                )
+                if input_data["format"] == "numpy":
+                    # If the original format is numpy, just copy the file.
+                    os.makedirs(
+                        dataset_path / os.path.dirname(output_data["path"]),
+                        exist_ok=True,
+                    )
+                    shutil.copy(
+                        dataset_path / input_data["path"],
+                        dataset_path / output_data["path"],
+                    )
+                else:
+                    # If the original format is not numpy, convert it to numpy.
+                    input_set = read_data(
+                        dataset_path / input_data["path"],
+                        input_data["format"],
+                    )
+                    save_data(
+                        input_set,
+                        dataset_path / output_data["path"],
+                        output_set_per_type["format"],
+                    )

     # 8. Save the output_config.
     output_config_path = dataset_path / "output_config.yaml"
...
...
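For illustration, a standalone sketch of what step 7 now does to a single data entry of one split. The input_data dictionary, its "torch" format tag, and the "preprocessed" prefix are hypothetical; the real function additionally copies the file or converts it to numpy on disk:

    from pathlib import PurePosixPath

    # Hypothetical entry taken from input_config["train_set"][i]["data"][j].
    input_data = {"format": "torch", "path": "set/paper-train.pt"}
    output_data = {}

    processed_dir_prefix = PurePosixPath("preprocessed")

    # Mirror of the rewriting in step 7: the output format is always numpy and
    # the path is rewritten from .pt to .npy under the processed directory.
    output_data["format"] = "numpy"
    output_data["path"] = str(
        processed_dir_prefix / input_data["path"].replace("pt", "npy")
    )

    print(output_data)  # {'format': 'numpy', 'path': 'preprocessed/set/paper-train.npy'}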
@@ -245,27 +242,27 @@ class OnDiskDataset(Dataset):
           format: numpy
           in_memory: false
           path: edge_data/author-writes-paper-feat.npy
-        train_sets:
-          - - type: paper # could be null for homogeneous graph.
-              data: # multiple data sources could be specified.
-                - format: numpy
-                  in_memory: true # If not specified, default to true.
-                  path: set/paper-train-src.npy
-                - format: numpy
-                  in_memory: false
-                  path: set/paper-train-dst.npy
-        validation_sets:
-          - - type: paper
-              data:
-                - format: numpy
-                  in_memory: true
-                  path: set/paper-validation.npy
-        test_sets:
-          - - type: paper
-              data:
-                - format: numpy
-                  in_memory: true
-                  path: set/paper-test.npy
+        train_set:
+          - type: paper # could be null for homogeneous graph.
+            data: # multiple data sources could be specified.
+              - format: numpy
+                in_memory: true # If not specified, default to true.
+                path: set/paper-train-src.npy
+              - format: numpy
+                in_memory: false
+                path: set/paper-train-dst.npy
+        validation_set:
+          - type: paper
+            data:
+              - format: numpy
+                in_memory: true
+                path: set/paper-validation.npy
+        test_set:
+          - type: paper
+            data:
+              - format: numpy
+                in_memory: true
+                path: set/paper-test.npy
Parameters
----------
...
...
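For the homogeneous case that the docstring only hints at ("could be null for homogeneous graph"), a metadata fragment under the new flattened layout might look like the snippet below. The path is hypothetical and the example only shows the structure PyYAML produces, assuming PyYAML is available:

    import yaml  # PyYAML

    # Hypothetical fragment for a homogeneous graph under the new layout:
    # one entry per split, with no extra level of list nesting.
    snippet = """
    train_set:
      - type: null          # Null type: homogeneous graph.
        data:
          - format: numpy
            in_memory: true
            path: set/train-seeds.npy
    """

    config = yaml.safe_load(snippet)
    entry = config["train_set"][0]
    print(entry["type"])             # None
    print(entry["data"][0]["path"])  # set/train-seeds.npy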
@@ -285,24 +282,24 @@ class OnDiskDataset(Dataset):
         self._num_labels = self._meta.num_labels
         self._graph = self._load_graph(self._meta.graph_topology)
         self._feature = load_feature_stores(self._meta.feature_data)
-        self._train_sets = self._init_tvt_sets(self._meta.train_sets)
-        self._validation_sets = self._init_tvt_sets(self._meta.validation_sets)
-        self._test_sets = self._init_tvt_sets(self._meta.test_sets)
+        self._train_set = self._init_tvt_set(self._meta.train_set)
+        self._validation_set = self._init_tvt_set(self._meta.validation_set)
+        self._test_set = self._init_tvt_set(self._meta.test_set)

     @property
-    def train_sets(self) -> List[ItemSet] or List[ItemSetDict]:
+    def train_set(self) -> ItemSet or ItemSetDict:
         """Return the training set."""
-        return self._train_sets
+        return self._train_set

     @property
-    def validation_sets(self) -> List[ItemSet] or List[ItemSetDict]:
+    def validation_set(self) -> ItemSet or ItemSetDict:
         """Return the validation set."""
-        return self._validation_sets
+        return self._validation_set

     @property
-    def test_sets(self) -> List[ItemSet] or List[ItemSetDict]:
+    def test_set(self) -> ItemSet or ItemSetDict:
         """Return the test set."""
-        return self._test_sets
+        return self._test_set

     @property
     def graph(self) -> object:
...
...
@@ -341,36 +338,31 @@ class OnDiskDataset(Dataset):
                 f"Graph topology type {graph_topology.type} is not supported."
             )

-    def _init_tvt_sets(
-        self, tvt_sets: List[List[OnDiskTVTSet]]
-    ) -> List[ItemSet] or List[ItemSetDict]:
-        """Initialize the TVT sets."""
-        if (tvt_sets is None) or (len(tvt_sets) == 0):
-            return None
-        ret = []
-        for tvt_set in tvt_sets:
-            if (tvt_set is None) or (len(tvt_set) == 0):
-                ret.append(None)
-            if tvt_set[0].type is None:
-                assert (
-                    len(tvt_set) == 1
-                ), "Only one TVT set is allowed if type is not specified."
-                ret.append(
-                    ItemSet(
-                        tuple(
-                            read_data(data.path, data.format, data.in_memory)
-                            for data in tvt_set[0].data
-                        )
-                    )
-                )
-            else:
-                data = {}
-                for tvt in tvt_set:
-                    data[tvt.type] = ItemSet(
-                        tuple(
-                            read_data(data.path, data.format, data.in_memory)
-                            for data in tvt.data
-                        )
-                    )
-                ret.append(ItemSetDict(data))
-        return ret
+    def _init_tvt_set(
+        self, tvt_set: List[OnDiskTVTSet]
+    ) -> ItemSet or ItemSetDict:
+        """Initialize the TVT set."""
+        ret = None
+        if (tvt_set is None) or (len(tvt_set) == 0):
+            return ret
+        if tvt_set[0].type is None:
+            assert (
+                len(tvt_set) == 1
+            ), "Only one TVT set is allowed if type is not specified."
+            ret = ItemSet(
+                tuple(
+                    read_data(data.path, data.format, data.in_memory)
+                    for data in tvt_set[0].data
+                )
+            )
+        else:
+            data = {}
+            for tvt in tvt_set:
+                data[tvt.type] = ItemSet(
+                    tuple(
+                        read_data(data.path, data.format, data.in_memory)
+                        for data in tvt.data
+                    )
+                )
+            ret = ItemSetDict(data)
+        return ret
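A condensed, standalone illustration of the branching in the new _init_tvt_set, using plain dictionaries and tuples instead of the real OnDiskTVTSet, ItemSet, and ItemSetDict classes; every name below is a hypothetical stand-in:

    def init_tvt_set_sketch(tvt_set):
        """Mimic _init_tvt_set's branching on a list of {'type', 'data'} dicts."""
        if not tvt_set:
            # Missing or empty split: nothing to build.
            return None
        if tvt_set[0]["type"] is None:
            # Homogeneous case: a single untyped entry becomes one flat set.
            assert len(tvt_set) == 1, "Only one entry is allowed if type is not specified."
            return tuple(tvt_set[0]["data"])
        # Heterogeneous case: one set per type, keyed by the type name.
        return {entry["type"]: tuple(entry["data"]) for entry in tvt_set}


    print(init_tvt_set_sketch([{"type": None, "data": [[0, 1, 2]]}]))
    print(init_tvt_set_sketch([
        {"type": "paper", "data": [[0, 1]]},
        {"type": "author", "data": [[2, 3]]},
    ]))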
python/dgl/graphbolt/impl/ondisk_metadata.py
...
...
@@ -83,6 +83,6 @@ class OnDiskMetaData(pydantic.BaseModel):
     num_labels: Optional[int] = None
     graph_topology: Optional[OnDiskGraphTopology] = None
     feature_data: Optional[List[OnDiskFeatureData]] = []
-    train_sets: Optional[List[List[OnDiskTVTSet]]] = []
-    validation_sets: Optional[List[List[OnDiskTVTSet]]] = []
-    test_sets: Optional[List[List[OnDiskTVTSet]]] = []
+    train_set: Optional[List[OnDiskTVTSet]] = []
+    validation_set: Optional[List[OnDiskTVTSet]] = []
+    test_set: Optional[List[OnDiskTVTSet]] = []
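The metadata model now validates one level of list nesting less per split. A simplified sketch of that validation with pydantic; TVTSetSketch and MetaDataSketch are reduced stand-ins rather than the real OnDiskTVTSet/OnDiskMetaData models:

    from typing import List, Optional

    import pydantic


    class TVTSetSketch(pydantic.BaseModel):
        """Reduced stand-in for OnDiskTVTSet: one typed group of data entries."""
        type: Optional[str] = None
        data: List[dict] = []


    class MetaDataSketch(pydantic.BaseModel):
        """Reduced stand-in for OnDiskMetaData after this commit."""
        # One flat list of per-type entries per split (previously List[List[...]]).
        train_set: Optional[List[TVTSetSketch]] = []


    meta = MetaDataSketch(
        train_set=[
            {"type": "paper", "data": [{"format": "numpy", "path": "set/paper-train.npy"}]}
        ]
    )
    print(meta.train_set[0].type)  # paper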
tests/python/pytorch/graphbolt/test_dataset.py
-import os
-import tempfile
-
-import numpy as np
-import pydantic
 import pytest

 from dgl import graphbolt as gb
...

@@ -11,15 +5,15 @@ from dgl import graphbolt as gb
 def test_Dataset():
     dataset = gb.Dataset()
     with pytest.raises(NotImplementedError):
-        _ = dataset.train_sets()
+        _ = dataset.train_set
     with pytest.raises(NotImplementedError):
-        _ = dataset.validation_sets()
+        _ = dataset.validation_set
     with pytest.raises(NotImplementedError):
-        _ = dataset.test_sets()
+        _ = dataset.test_set
     with pytest.raises(NotImplementedError):
-        _ = dataset.graph()
+        _ = dataset.graph
     with pytest.raises(NotImplementedError):
-        _ = dataset.feature()
+        _ = dataset.feature
     with pytest.raises(NotImplementedError):
         _ = dataset.dataset_name
     with pytest.raises(NotImplementedError):
...
...
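The test now reads properties instead of calling methods, so plain attribute access inside pytest.raises is enough to trigger the base class's NotImplementedError. A minimal self-contained sketch of that pattern (not the actual test file):

    import pytest


    class _AbstractLike:
        @property
        def train_set(self):
            # Mirrors the abstract Dataset property: reading the attribute raises.
            raise NotImplementedError


    def test_property_access_raises():
        with pytest.raises(NotImplementedError):
            _ = _AbstractLike().train_set  # No call parentheses needed.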
tests/python/pytorch/graphbolt/test_ondisk_dataset.py

(This diff is collapsed and not shown here.)