Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
47d37e91
Unverified
Commit
47d37e91
authored
Aug 02, 2023
by
Rhett Ying
Committed by
GitHub
Aug 02, 2023
Browse files
[GraphBolt] convert TVT from list of list to list (#6080)
parent
12ade95c
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
435 additions
and
481 deletions
+435
-481
python/dgl/graphbolt/dataset.py
python/dgl/graphbolt/dataset.py
+7
-7
python/dgl/graphbolt/impl/ondisk_dataset.py
python/dgl/graphbolt/impl/ondisk_dataset.py
+85
-93
python/dgl/graphbolt/impl/ondisk_metadata.py
python/dgl/graphbolt/impl/ondisk_metadata.py
+3
-3
tests/python/pytorch/graphbolt/test_dataset.py
tests/python/pytorch/graphbolt/test_dataset.py
+5
-11
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
+335
-367
No files found.
python/dgl/graphbolt/dataset.py
View file @
47d37e91
"""GraphBolt Dataset."""
"""GraphBolt Dataset."""
from
typing
import
Dict
,
List
from
typing
import
Dict
from
.feature_store
import
FeatureStore
from
.feature_store
import
FeatureStore
from
.itemset
import
ItemSet
,
ItemSetDict
from
.itemset
import
ItemSet
,
ItemSetDict
...
@@ -32,18 +32,18 @@ class Dataset:
...
@@ -32,18 +32,18 @@ class Dataset:
"""
"""
@
property
@
property
def
train_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
train_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the training set
s
."""
"""Return the training set."""
raise
NotImplementedError
raise
NotImplementedError
@
property
@
property
def
validation_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
validation_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the validation set
s
."""
"""Return the validation set."""
raise
NotImplementedError
raise
NotImplementedError
@
property
@
property
def
test_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
test_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the test set
s
."""
"""Return the test set."""
raise
NotImplementedError
raise
NotImplementedError
@
property
@
property
...
...
python/dgl/graphbolt/impl/ondisk_dataset.py
View file @
47d37e91
...
@@ -165,45 +165,42 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
...
@@ -165,45 +165,42 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
)
)
# 7. Save the train/val/test split according to the output_config.
# 7. Save the train/val/test split according to the output_config.
for
set_name
in
[
"train_set
s
"
,
"validation_set
s
"
,
"test_set
s
"
]:
for
set_name
in
[
"train_set"
,
"validation_set"
,
"test_set"
]:
if
set_name
not
in
input_config
:
if
set_name
not
in
input_config
:
continue
continue
for
in
t
put_set_
split
,
output_set_
split
in
zip
(
for
input_set_
per_type
,
output_set_
per_type
in
zip
(
input_config
[
set_name
],
output_config
[
set_name
]
input_config
[
set_name
],
output_config
[
set_name
]
):
):
for
input_
set_per_type
,
output_set_per_type
in
zip
(
for
input_
data
,
output_data
in
zip
(
in
t
put_set_
split
,
output_set_split
input_set_
per_type
[
"data"
],
output_set_per_type
[
"data"
]
):
):
for
input_data
,
output_data
in
zip
(
# Always save the feature in numpy format.
input_set_per_type
[
"data"
],
output_set_per_type
[
"data"
]
output_data
[
"format"
]
=
"numpy"
):
output_data
[
"path"
]
=
str
(
# Always save the feature in numpy format.
processed_dir_prefix
output_data
[
"format"
]
=
"numpy"
/
input_data
[
"path"
].
replace
(
"pt"
,
"npy"
)
output_data
[
"path"
]
=
str
(
)
processed_dir_prefix
if
input_data
[
"format"
]
==
"numpy"
:
/
input_data
[
"path"
].
replace
(
"pt"
,
"npy"
)
# If the original format is numpy, just copy the file.
os
.
makedirs
(
dataset_path
/
os
.
path
.
dirname
(
output_data
[
"path"
]),
exist_ok
=
True
,
)
shutil
.
copy
(
dataset_path
/
input_data
[
"path"
],
dataset_path
/
output_data
[
"path"
],
)
else
:
# If the original format is not numpy, convert it to numpy.
input_set
=
read_data
(
dataset_path
/
input_data
[
"path"
],
input_data
[
"format"
],
)
save_data
(
input_set
,
dataset_path
/
output_data
[
"path"
],
output_set_per_type
[
"format"
],
)
)
if
input_data
[
"format"
]
==
"numpy"
:
# If the original format is numpy, just copy the file.
os
.
makedirs
(
dataset_path
/
os
.
path
.
dirname
(
output_data
[
"path"
]),
exist_ok
=
True
,
)
shutil
.
copy
(
dataset_path
/
input_data
[
"path"
],
dataset_path
/
output_data
[
"path"
],
)
else
:
# If the original format is not numpy, convert it to numpy.
input_set
=
read_data
(
dataset_path
/
input_data
[
"path"
],
input_data
[
"format"
],
)
save_data
(
input_set
,
dataset_path
/
output_data
[
"path"
],
output_set_per_type
[
"format"
],
)
# 8. Save the output_config.
# 8. Save the output_config.
output_config_path
=
dataset_path
/
"output_config.yaml"
output_config_path
=
dataset_path
/
"output_config.yaml"
...
@@ -245,27 +242,27 @@ class OnDiskDataset(Dataset):
...
@@ -245,27 +242,27 @@ class OnDiskDataset(Dataset):
format: numpy
format: numpy
in_memory: false
in_memory: false
path: edge_data/author-writes-paper-feat.npy
path: edge_data/author-writes-paper-feat.npy
train_set
s
:
train_set:
-
-
type: paper # could be null for homogeneous graph.
- type: paper # could be null for homogeneous graph.
data: # multiple data sources could be specified.
data: # multiple data sources could be specified.
- format: numpy
- format: numpy
in_memory: true # If not specified, default to true.
in_memory: true # If not specified, default to true.
path: set/paper-train-src.npy
path: set/paper-train-src.npy
- format: numpy
- format: numpy
in_memory: false
in_memory: false
path: set/paper-train-dst.npy
path: set/paper-train-dst.npy
validation_set
s
:
validation_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path: set/paper-validation.npy
path: set/paper-validation.npy
test_set
s
:
test_set:
-
-
type: paper
- type: paper
data:
data:
- format: numpy
- format: numpy
in_memory: true
in_memory: true
path: set/paper-test.npy
path: set/paper-test.npy
Parameters
Parameters
----------
----------
...
@@ -285,24 +282,24 @@ class OnDiskDataset(Dataset):
...
@@ -285,24 +282,24 @@ class OnDiskDataset(Dataset):
self
.
_num_labels
=
self
.
_meta
.
num_labels
self
.
_num_labels
=
self
.
_meta
.
num_labels
self
.
_graph
=
self
.
_load_graph
(
self
.
_meta
.
graph_topology
)
self
.
_graph
=
self
.
_load_graph
(
self
.
_meta
.
graph_topology
)
self
.
_feature
=
load_feature_stores
(
self
.
_meta
.
feature_data
)
self
.
_feature
=
load_feature_stores
(
self
.
_meta
.
feature_data
)
self
.
_train_set
s
=
self
.
_init_tvt_set
s
(
self
.
_meta
.
train_set
s
)
self
.
_train_set
=
self
.
_init_tvt_set
(
self
.
_meta
.
train_set
)
self
.
_validation_set
s
=
self
.
_init_tvt_set
s
(
self
.
_meta
.
validation_set
s
)
self
.
_validation_set
=
self
.
_init_tvt_set
(
self
.
_meta
.
validation_set
)
self
.
_test_set
s
=
self
.
_init_tvt_set
s
(
self
.
_meta
.
test_set
s
)
self
.
_test_set
=
self
.
_init_tvt_set
(
self
.
_meta
.
test_set
)
@
property
@
property
def
train_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
train_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the training set."""
"""Return the training set."""
return
self
.
_train_set
s
return
self
.
_train_set
@
property
@
property
def
validation_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
validation_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the validation set."""
"""Return the validation set."""
return
self
.
_validation_set
s
return
self
.
_validation_set
@
property
@
property
def
test_set
s
(
self
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]
:
def
test_set
(
self
)
->
ItemSet
or
ItemSetDict
:
"""Return the test set."""
"""Return the test set."""
return
self
.
_test_set
s
return
self
.
_test_set
@
property
@
property
def
graph
(
self
)
->
object
:
def
graph
(
self
)
->
object
:
...
@@ -341,36 +338,31 @@ class OnDiskDataset(Dataset):
...
@@ -341,36 +338,31 @@ class OnDiskDataset(Dataset):
f
"Graph topology type
{
graph_topology
.
type
}
is not supported."
f
"Graph topology type
{
graph_topology
.
type
}
is not supported."
)
)
def
_init_tvt_sets
(
def
_init_tvt_set
(
self
,
tvt_sets
:
List
[
List
[
OnDiskTVTSet
]]
self
,
tvt_set
:
List
[
OnDiskTVTSet
]
)
->
List
[
ItemSet
]
or
List
[
ItemSetDict
]:
)
->
ItemSet
or
ItemSetDict
:
"""Initialize the TVT sets."""
"""Initialize the TVT set."""
if
(
tvt_sets
is
None
)
or
(
len
(
tvt_sets
)
==
0
):
ret
=
None
return
None
if
(
tvt_set
is
None
)
or
(
len
(
tvt_set
)
==
0
):
ret
=
[]
return
ret
for
tvt_set
in
tvt_sets
:
if
tvt_set
[
0
].
type
is
None
:
if
(
tvt_set
is
None
)
or
(
len
(
tvt_set
)
==
0
):
assert
(
ret
.
append
(
None
)
len
(
tvt_set
)
==
1
if
tvt_set
[
0
].
type
is
None
:
),
"Only one TVT set is allowed if type is not specified."
assert
(
ret
=
ItemSet
(
len
(
tvt_set
)
==
1
tuple
(
),
"Only one TVT set is allowed if type is not specified."
read_data
(
data
.
path
,
data
.
format
,
data
.
in_memory
)
ret
.
append
(
for
data
in
tvt_set
[
0
].
data
ItemSet
(
tuple
(
read_data
(
data
.
path
,
data
.
format
,
data
.
in_memory
)
for
data
in
tvt_set
[
0
].
data
)
)
)
)
else
:
)
data
=
{}
else
:
for
tvt
in
tvt_set
:
data
=
{}
data
[
tvt
.
type
]
=
ItemS
et
(
for
tvt
in
tvt_s
et
:
tuple
(
data
[
tvt
.
type
]
=
ItemSet
(
read_data
(
data
.
path
,
data
.
format
,
data
.
in_memory
)
tuple
(
for
data
in
tvt
.
data
read_data
(
data
.
path
,
data
.
format
,
data
.
in_memory
)
)
for
data
in
tvt
.
data
)
)
ret
.
append
(
ItemSetDict
(
data
))
)
ret
=
ItemSetDict
(
data
)
return
ret
return
ret
python/dgl/graphbolt/impl/ondisk_metadata.py
View file @
47d37e91
...
@@ -83,6 +83,6 @@ class OnDiskMetaData(pydantic.BaseModel):
...
@@ -83,6 +83,6 @@ class OnDiskMetaData(pydantic.BaseModel):
num_labels
:
Optional
[
int
]
=
None
num_labels
:
Optional
[
int
]
=
None
graph_topology
:
Optional
[
OnDiskGraphTopology
]
=
None
graph_topology
:
Optional
[
OnDiskGraphTopology
]
=
None
feature_data
:
Optional
[
List
[
OnDiskFeatureData
]]
=
[]
feature_data
:
Optional
[
List
[
OnDiskFeatureData
]]
=
[]
train_set
s
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]
]
=
[]
train_set
:
Optional
[
List
[
OnDiskTVTSet
]]
=
[]
validation_set
s
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]
]
=
[]
validation_set
:
Optional
[
List
[
OnDiskTVTSet
]]
=
[]
test_set
s
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]
]
=
[]
test_set
:
Optional
[
List
[
OnDiskTVTSet
]]
=
[]
tests/python/pytorch/graphbolt/test_dataset.py
View file @
47d37e91
import
os
import
tempfile
import
numpy
as
np
import
pydantic
import
pytest
import
pytest
from
dgl
import
graphbolt
as
gb
from
dgl
import
graphbolt
as
gb
...
@@ -11,15 +5,15 @@ from dgl import graphbolt as gb
...
@@ -11,15 +5,15 @@ from dgl import graphbolt as gb
def
test_Dataset
():
def
test_Dataset
():
dataset
=
gb
.
Dataset
()
dataset
=
gb
.
Dataset
()
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
train_set
s
()
_
=
dataset
.
train_set
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
validation_set
s
()
_
=
dataset
.
validation_set
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
test_set
s
()
_
=
dataset
.
test_set
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
graph
()
_
=
dataset
.
graph
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
feature
()
_
=
dataset
.
feature
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
_
=
dataset
.
dataset_name
_
=
dataset
.
dataset_name
with
pytest
.
raises
(
NotImplementedError
):
with
pytest
.
raises
(
NotImplementedError
):
...
...
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
View file @
47d37e91
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment