Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
14f396d0
Unverified
Commit
14f396d0
authored
Aug 01, 2023
by
Rhett Ying
Committed by
GitHub
Aug 01, 2023
Browse files
[GraphBolt] change TVT format of OnDiskDataset (#6076)
parent
17f6c4c9
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
347 additions
and
163 deletions
+347
-163
python/dgl/graphbolt/impl/ondisk_dataset.py
python/dgl/graphbolt/impl/ondisk_dataset.py
+55
-42
python/dgl/graphbolt/impl/ondisk_metadata.py
python/dgl/graphbolt/impl/ondisk_metadata.py
+10
-3
python/dgl/graphbolt/utils/internal.py
python/dgl/graphbolt/utils/internal.py
+0
-6
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
+282
-112
No files found.
python/dgl/graphbolt/impl/ondisk_dataset.py
View file @
14f396d0
...
@@ -15,7 +15,7 @@ import dgl
...
@@ -15,7 +15,7 @@ import dgl
from
..dataset
import
Dataset
from
..dataset
import
Dataset
from
..itemset
import
ItemSet
,
ItemSetDict
from
..itemset
import
ItemSet
,
ItemSetDict
from
..utils
import
read_data
,
save_data
,
tensor_to_tuple
from
..utils
import
read_data
,
save_data
from
.csc_sampling_graph
import
(
from
.csc_sampling_graph
import
(
CSCSamplingGraph
,
CSCSamplingGraph
,
...
@@ -174,34 +174,36 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
...
@@ -174,34 +174,36 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
for
input_set_per_type
,
output_set_per_type
in
zip
(
for
input_set_per_type
,
output_set_per_type
in
zip
(
intput_set_split
,
output_set_split
intput_set_split
,
output_set_split
):
):
# Always save the feature in numpy format.
for
input_data
,
output_data
in
zip
(
output_set_per_type
[
"format"
]
=
"numpy"
input_set_per_type
[
"data"
],
output_set_per_type
[
"data"
]
output_set_per_type
[
"path"
]
=
str
(
):
processed_dir_prefix
# Always save the feature in numpy format.
/
input_set_per_type
[
"path"
].
replace
(
"pt"
,
"npy"
)
output_data
[
"format"
]
=
"numpy"
)
output_data
[
"path"
]
=
str
(
if
input_set_per_type
[
"format"
]
==
"numpy"
:
processed_dir_prefix
# If the original format is numpy, just copy the file.
/
input_data
[
"path"
].
replace
(
"pt"
,
"npy"
)
os
.
makedirs
(
dataset_path
/
os
.
path
.
dirname
(
output_set_per_type
[
"path"
]),
exist_ok
=
True
,
)
shutil
.
copy
(
dataset_path
/
input_set_per_type
[
"path"
],
dataset_path
/
output_set_per_type
[
"path"
],
)
else
:
# If the original format is not numpy, convert it to numpy.
input_set
=
read_data
(
dataset_path
/
input_set_per_type
[
"path"
],
input_set_per_type
[
"format"
],
)
save_data
(
input_set
,
dataset_path
/
output_set_per_type
[
"path"
],
output_set_per_type
[
"format"
],
)
)
if
input_data
[
"format"
]
==
"numpy"
:
# If the original format is numpy, just copy the file.
os
.
makedirs
(
dataset_path
/
os
.
path
.
dirname
(
output_data
[
"path"
]),
exist_ok
=
True
,
)
shutil
.
copy
(
dataset_path
/
input_data
[
"path"
],
dataset_path
/
output_data
[
"path"
],
)
else
:
# If the original format is not numpy, convert it to numpy.
input_set
=
read_data
(
dataset_path
/
input_data
[
"path"
],
input_data
[
"format"
],
)
save_data
(
input_set
,
dataset_path
/
output_data
[
"path"
],
output_set_per_type
[
"format"
],
)
# 8. Save the output_config.
# 8. Save the output_config.
output_config_path
=
dataset_path
/
"output_config.yaml"
output_config_path
=
dataset_path
/
"output_config.yaml"
...
@@ -245,19 +247,25 @@ class OnDiskDataset(Dataset):
...
@@ -245,19 +247,25 @@ class OnDiskDataset(Dataset):
path: edge_data/author-writes-paper-feat.npy
path: edge_data/author-writes-paper-feat.npy
train_sets:
train_sets:
- - type: paper # could be null for homogeneous graph.
- - type: paper # could be null for homogeneous graph.
format: numpy
data: # multiple data sources could be specified.
in_memory: true # If not specified, default to true.
- format: numpy
path: set/paper-train.npy
in_memory: true # If not specified, default to true.
path: set/paper-train-src.npy
- format: numpy
in_memory: false
path: set/paper-train-dst.npy
validation_sets:
validation_sets:
- - type: paper
- - type: paper
format: numpy
data:
in_memory: true
- format: numpy
path: set/paper-validation.npy
in_memory: true
path: set/paper-validation.npy
test_sets:
test_sets:
- - type: paper
- - type: paper
format: numpy
data:
in_memory: true
- format: numpy
path: set/paper-test.npy
in_memory: true
path: set/paper-test.npy
Parameters
Parameters
----------
----------
...
@@ -347,16 +355,21 @@ class OnDiskDataset(Dataset):
...
@@ -347,16 +355,21 @@ class OnDiskDataset(Dataset):
assert
(
assert
(
len
(
tvt_set
)
==
1
len
(
tvt_set
)
==
1
),
"Only one TVT set is allowed if type is not specified."
),
"Only one TVT set is allowed if type is not specified."
data
=
read_data
(
ret
.
append
(
tvt_set
[
0
].
path
,
tvt_set
[
0
].
format
,
tvt_set
[
0
].
in_memory
ItemSet
(
tuple
(
read_data
(
data
.
path
,
data
.
format
,
data
.
in_memory
)
for
data
in
tvt_set
[
0
].
data
)
)
)
)
ret
.
append
(
ItemSet
(
tensor_to_tuple
(
data
)))
else
:
else
:
data
=
{}
data
=
{}
for
tvt
in
tvt_set
:
for
tvt
in
tvt_set
:
data
[
tvt
.
type
]
=
ItemSet
(
data
[
tvt
.
type
]
=
ItemSet
(
tensor_to_tuple
(
tuple
(
read_data
(
tvt
.
path
,
tvt
.
format
,
tvt
.
in_memory
)
read_data
(
data
.
path
,
data
.
format
,
data
.
in_memory
)
for
data
in
tvt
.
data
)
)
)
)
ret
.
append
(
ItemSetDict
(
data
))
ret
.
append
(
ItemSetDict
(
data
))
...
...
python/dgl/graphbolt/impl/ondisk_metadata.py
View file @
14f396d0
...
@@ -8,6 +8,7 @@ import pydantic
...
@@ -8,6 +8,7 @@ import pydantic
__all__
=
[
__all__
=
[
"OnDiskFeatureDataFormat"
,
"OnDiskFeatureDataFormat"
,
"OnDiskTVTSetData"
,
"OnDiskTVTSet"
,
"OnDiskTVTSet"
,
"OnDiskFeatureDataDomain"
,
"OnDiskFeatureDataDomain"
,
"OnDiskFeatureData"
,
"OnDiskFeatureData"
,
...
@@ -24,15 +25,21 @@ class OnDiskFeatureDataFormat(str, Enum):
...
@@ -24,15 +25,21 @@ class OnDiskFeatureDataFormat(str, Enum):
NUMPY
=
"numpy"
NUMPY
=
"numpy"
class
OnDiskTVTSet
(
pydantic
.
BaseModel
):
class
OnDiskTVTSet
Data
(
pydantic
.
BaseModel
):
"""Train-Validation-Test set."""
"""Train-Validation-Test set
data
."""
type
:
Optional
[
str
]
=
None
format
:
OnDiskFeatureDataFormat
format
:
OnDiskFeatureDataFormat
in_memory
:
Optional
[
bool
]
=
True
in_memory
:
Optional
[
bool
]
=
True
path
:
str
path
:
str
class
OnDiskTVTSet
(
pydantic
.
BaseModel
):
"""Train-Validation-Test set."""
type
:
Optional
[
str
]
=
None
data
:
List
[
OnDiskTVTSetData
]
class
OnDiskFeatureDataDomain
(
str
,
Enum
):
class
OnDiskFeatureDataDomain
(
str
,
Enum
):
"""Enum of feature data domain."""
"""Enum of feature data domain."""
...
...
python/dgl/graphbolt/utils/internal.py
View file @
14f396d0
...
@@ -45,9 +45,3 @@ def save_data(data, path, fmt):
...
@@ -45,9 +45,3 @@ def save_data(data, path, fmt):
np
.
save
(
path
,
data
)
np
.
save
(
path
,
data
)
elif
fmt
==
"torch"
:
elif
fmt
==
"torch"
:
torch
.
save
(
data
,
path
)
torch
.
save
(
data
,
path
)
def
tensor_to_tuple
(
data
):
"""Split a torch.Tensor in column-wise to a tuple."""
assert
isinstance
(
data
,
torch
.
Tensor
),
"data must be a torch.Tensor"
return
tuple
(
data
.
t
())
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
View file @
14f396d0
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment