Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
5fbb33e7
"...text-generation-inference.git" did not exist on "5da4cfab1c211ff3e2aefbd0358f714970fb8360"
Unverified
Commit
5fbb33e7
authored
Jul 10, 2023
by
Rhett Ying
Committed by
GitHub
Jul 10, 2023
Browse files
[GraphBolt] init feature data for Dataset (#5971)
parent
70ad5083
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
197 additions
and
9 deletions
+197
-9
python/dgl/graphbolt/dataset.py
python/dgl/graphbolt/dataset.py
+2
-2
python/dgl/graphbolt/impl/ondisk_dataset.py
python/dgl/graphbolt/impl/ondisk_dataset.py
+21
-4
python/dgl/graphbolt/impl/ondisk_metadata.py
python/dgl/graphbolt/impl/ondisk_metadata.py
+4
-3
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
+170
-0
No files found.
python/dgl/graphbolt/dataset.py
View file @
5fbb33e7
"""GraphBolt Dataset."""
"""GraphBolt Dataset."""
from
typing
import
List
from
typing
import
Dict
,
List
from
.feature_store
import
FeatureStore
from
.feature_store
import
FeatureStore
from
.itemset
import
ItemSet
,
ItemSetDict
from
.itemset
import
ItemSet
,
ItemSetDict
...
@@ -47,6 +47,6 @@ class Dataset:
...
@@ -47,6 +47,6 @@ class Dataset:
"""Return the graph."""
"""Return the graph."""
raise
NotImplementedError
raise
NotImplementedError
def
feature
(
self
)
->
FeatureStore
:
def
feature
(
self
)
->
Dict
[
object
,
FeatureStore
]
:
"""Return the feature."""
"""Return the feature."""
raise
NotImplementedError
raise
NotImplementedError
python/dgl/graphbolt/impl/ondisk_dataset.py
View file @
5fbb33e7
"""GraphBolt OnDiskDataset."""
"""GraphBolt OnDiskDataset."""
from
typing
import
List
from
typing
import
Dict
,
List
,
Tuple
from
..dataset
import
Dataset
from
..dataset
import
Dataset
from
..feature_store
import
FeatureStore
from
..itemset
import
ItemSet
,
ItemSetDict
from
..itemset
import
ItemSet
,
ItemSetDict
from
..utils
import
read_data
,
tensor_to_tuple
from
..utils
import
read_data
,
tensor_to_tuple
from
.ondisk_metadata
import
OnDiskMetaData
,
OnDiskTVTSet
from
.ondisk_metadata
import
OnDiskMetaData
,
OnDiskTVTSet
from
.torch_based_feature_store
import
(
load_feature_stores
,
TorchBasedFeatureStore
,
)
__all__
=
[
"OnDiskDataset"
]
__all__
=
[
"OnDiskDataset"
]
...
@@ -24,6 +27,19 @@ class OnDiskDataset(Dataset):
...
@@ -24,6 +27,19 @@ class OnDiskDataset(Dataset):
.. code-block:: yaml
.. code-block:: yaml
feature_data:
- domain: node
type: paper
name: feat
format: numpy
in_memory: false
path: node_data/paper-feat.npy
- domain: edge
type: "author:writes:paper"
name: feat
format: numpy
in_memory: false
path: edge_data/author-writes-paper-feat.npy
train_sets:
train_sets:
- - type_name: paper # could be null for homogeneous graph.
- - type_name: paper # could be null for homogeneous graph.
format: numpy
format: numpy
...
@@ -49,6 +65,7 @@ class OnDiskDataset(Dataset):
...
@@ -49,6 +65,7 @@ class OnDiskDataset(Dataset):
def
__init__
(
self
,
path
:
str
)
->
None
:
def
__init__
(
self
,
path
:
str
)
->
None
:
with
open
(
path
,
"r"
)
as
f
:
with
open
(
path
,
"r"
)
as
f
:
self
.
_meta
=
OnDiskMetaData
.
parse_raw
(
f
.
read
(),
proto
=
"yaml"
)
self
.
_meta
=
OnDiskMetaData
.
parse_raw
(
f
.
read
(),
proto
=
"yaml"
)
self
.
_feature
=
load_feature_stores
(
self
.
_meta
.
feature_data
)
self
.
_train_sets
=
self
.
_init_tvt_sets
(
self
.
_meta
.
train_sets
)
self
.
_train_sets
=
self
.
_init_tvt_sets
(
self
.
_meta
.
train_sets
)
self
.
_validation_sets
=
self
.
_init_tvt_sets
(
self
.
_meta
.
validation_sets
)
self
.
_validation_sets
=
self
.
_init_tvt_sets
(
self
.
_meta
.
validation_sets
)
self
.
_test_sets
=
self
.
_init_tvt_sets
(
self
.
_meta
.
test_sets
)
self
.
_test_sets
=
self
.
_init_tvt_sets
(
self
.
_meta
.
test_sets
)
...
@@ -69,9 +86,9 @@ class OnDiskDataset(Dataset):
...
@@ -69,9 +86,9 @@ class OnDiskDataset(Dataset):
"""Return the graph."""
"""Return the graph."""
raise
NotImplementedError
raise
NotImplementedError
def
feature
(
self
)
->
FeatureStore
:
def
feature
(
self
)
->
Dict
[
Tuple
,
TorchBased
FeatureStore
]
:
"""Return the feature."""
"""Return the feature."""
r
aise
NotImplementedError
r
eturn
self
.
_feature
def
_init_tvt_sets
(
def
_init_tvt_sets
(
self
,
tvt_sets
:
List
[
List
[
OnDiskTVTSet
]]
self
,
tvt_sets
:
List
[
List
[
OnDiskTVTSet
]]
...
...
python/dgl/graphbolt/impl/ondisk_metadata.py
View file @
5fbb33e7
...
@@ -56,6 +56,7 @@ class OnDiskMetaData(pydantic_yaml.YamlModel):
...
@@ -56,6 +56,7 @@ class OnDiskMetaData(pydantic_yaml.YamlModel):
is a list of list of ``OnDiskTVTSet``.
is a list of list of ``OnDiskTVTSet``.
"""
"""
train_sets
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]]
feature_data
:
Optional
[
List
[
OnDiskFeatureData
]]
=
[]
validation_sets
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]]
train_sets
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]]
=
[]
test_sets
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]]
validation_sets
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]]
=
[]
test_sets
:
Optional
[
List
[
List
[
OnDiskTVTSet
]]]
=
[]
tests/python/pytorch/graphbolt/test_ondisk_dataset.py
View file @
5fbb33e7
...
@@ -5,6 +5,7 @@ import numpy as np
...
@@ -5,6 +5,7 @@ import numpy as np
import
pydantic
import
pydantic
import
pytest
import
pytest
import
torch
from
dgl
import
graphbolt
as
gb
from
dgl
import
graphbolt
as
gb
...
@@ -446,3 +447,172 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
...
@@ -446,3 +447,172 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
assert
label
==
test_labels
[
i
]
assert
label
==
test_labels
[
i
]
test_sets
=
None
test_sets
=
None
dataset
=
None
dataset
=
None
def test_OnDiskDataset_Feature_heterograph():
    """Check feature loading when every YAML entry carries a ``type``.

    Four ``.npy`` files (node/edge ``feat`` and ``label``) are written to a
    temporary directory and listed under ``feature_data`` in a YAML file.
    The dataset built from that file must expose one
    ``TorchBasedFeatureStore`` per entry, keyed by ``(domain, type, name)``,
    whose contents equal the arrays that were saved.
    """
    with tempfile.TemporaryDirectory() as test_dir:
        # Generate node data.
        paper_feat = np.random.rand(1000, 10)
        paper_feat_path = os.path.join(test_dir, "node_data_paper.npy")
        np.save(paper_feat_path, paper_feat)
        paper_label = np.random.randint(0, 10, size=1000)
        paper_label_path = os.path.join(test_dir, "node_data_label.npy")
        np.save(paper_label_path, paper_label)

        # Generate edge data.
        writes_feat = np.random.rand(1000, 10)
        writes_feat_path = os.path.join(test_dir, "edge_writes_paper.npy")
        np.save(writes_feat_path, writes_feat)
        writes_label = np.random.randint(0, 10, size=1000)
        writes_label_path = os.path.join(test_dir, "edge_data_label.npy")
        np.save(writes_label_path, writes_label)

        # Generate YAML.
        yaml_content = f"""
            feature_data:
              - domain: node
                type: paper
                name: feat
                format: numpy
                in_memory: false
                path: {paper_feat_path}
              - domain: node
                type: paper
                name: label
                format: numpy
                in_memory: true
                path: {paper_label_path}
              - domain: edge
                type: "author:writes:paper"
                name: feat
                format: numpy
                in_memory: false
                path: {writes_feat_path}
              - domain: edge
                type: "author:writes:paper"
                name: label
                format: numpy
                in_memory: true
                path: {writes_label_path}
        """
        yaml_file = os.path.join(test_dir, "test.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)

        dataset = gb.OnDiskDataset(yaml_file)

        # Verify feature data storage.
        feature_data = dataset.feature()
        assert len(feature_data) == 4

        # Expected content per key, listed node entries first, then edge
        # entries, so the checks run in a fixed order.
        expected = {
            ("node", "paper", "feat"): paper_feat,
            ("node", "paper", "label"): paper_label,
            ("edge", "author:writes:paper", "feat"): writes_feat,
            ("edge", "author:writes:paper", "label"): writes_label,
        }
        store = None
        for key, array in expected.items():
            store = feature_data[key]
            assert isinstance(store, gb.TorchBasedFeatureStore)
            assert torch.equal(store.read(), torch.tensor(array))

        # Drop every reference to the dataset and its stores before the
        # temporary directory is removed on leaving the ``with`` block.
        store = None
        feature_data = None
        dataset = None
def test_OnDiskDataset_Feature_homograph():
    """Check feature loading when the YAML entries omit ``type``.

    Four ``.npy`` files (node/edge ``feat`` and ``label``) are written to a
    temporary directory and listed under ``feature_data`` without a ``type``
    field. The resulting stores are looked up with ``None`` in the type
    position of the ``(domain, type, name)`` key and must match the saved
    arrays.
    """
    with tempfile.TemporaryDirectory() as test_dir:
        # Generate node data.
        node_feat_array = np.random.rand(1000, 10)
        node_feat_path = os.path.join(test_dir, "node_data_feat.npy")
        np.save(node_feat_path, node_feat_array)
        node_label_array = np.random.randint(0, 10, size=1000)
        node_label_path = os.path.join(test_dir, "node_data_label.npy")
        np.save(node_label_path, node_label_array)

        # Generate edge data.
        edge_feat_array = np.random.rand(1000, 10)
        edge_feat_path = os.path.join(test_dir, "edge_data_feat.npy")
        np.save(edge_feat_path, edge_feat_array)
        edge_label_array = np.random.randint(0, 10, size=1000)
        edge_label_path = os.path.join(test_dir, "edge_data_label.npy")
        np.save(edge_label_path, edge_label_array)

        # Generate YAML.
        # ``type`` is not specified in the YAML.
        yaml_content = f"""
            feature_data:
              - domain: node
                name: feat
                format: numpy
                in_memory: false
                path: {node_feat_path}
              - domain: node
                name: label
                format: numpy
                in_memory: true
                path: {node_label_path}
              - domain: edge
                name: feat
                format: numpy
                in_memory: false
                path: {edge_feat_path}
              - domain: edge
                name: label
                format: numpy
                in_memory: true
                path: {edge_label_path}
        """
        yaml_file = os.path.join(test_dir, "test.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)

        dataset = gb.OnDiskDataset(yaml_file)

        # Verify feature data storage.
        feature_data = dataset.feature()
        assert len(feature_data) == 4

        # Expected content per key; the type slot is ``None`` because the
        # YAML above carries no ``type`` field.
        expected = {
            ("node", None, "feat"): node_feat_array,
            ("node", None, "label"): node_label_array,
            ("edge", None, "feat"): edge_feat_array,
            ("edge", None, "label"): edge_label_array,
        }
        store = None
        for key, array in expected.items():
            store = feature_data[key]
            assert isinstance(store, gb.TorchBasedFeatureStore)
            assert torch.equal(store.read(), torch.tensor(array))

        # Drop every reference to the dataset and its stores before the
        # temporary directory is removed on leaving the ``with`` block.
        store = None
        feature_data = None
        dataset = None
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment