Unverified Commit 14f396d0 authored by Rhett Ying, committed by GitHub
Browse files

[GraphBolt] change TVT format of OnDiskDataset (#6076)

parent 17f6c4c9
...@@ -15,7 +15,7 @@ import dgl ...@@ -15,7 +15,7 @@ import dgl
from ..dataset import Dataset from ..dataset import Dataset
from ..itemset import ItemSet, ItemSetDict from ..itemset import ItemSet, ItemSetDict
from ..utils import read_data, save_data, tensor_to_tuple from ..utils import read_data, save_data
from .csc_sampling_graph import ( from .csc_sampling_graph import (
CSCSamplingGraph, CSCSamplingGraph,
...@@ -173,33 +173,35 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str: ...@@ -173,33 +173,35 @@ def preprocess_ondisk_dataset(input_config_path: str) -> str:
): ):
for input_set_per_type, output_set_per_type in zip( for input_set_per_type, output_set_per_type in zip(
intput_set_split, output_set_split intput_set_split, output_set_split
):
for input_data, output_data in zip(
input_set_per_type["data"], output_set_per_type["data"]
): ):
# Always save the feature in numpy format. # Always save the feature in numpy format.
output_set_per_type["format"] = "numpy" output_data["format"] = "numpy"
output_set_per_type["path"] = str( output_data["path"] = str(
processed_dir_prefix processed_dir_prefix
/ input_set_per_type["path"].replace("pt", "npy") / input_data["path"].replace("pt", "npy")
) )
if input_set_per_type["format"] == "numpy": if input_data["format"] == "numpy":
# If the original format is numpy, just copy the file. # If the original format is numpy, just copy the file.
os.makedirs( os.makedirs(
dataset_path dataset_path / os.path.dirname(output_data["path"]),
/ os.path.dirname(output_set_per_type["path"]),
exist_ok=True, exist_ok=True,
) )
shutil.copy( shutil.copy(
dataset_path / input_set_per_type["path"], dataset_path / input_data["path"],
dataset_path / output_set_per_type["path"], dataset_path / output_data["path"],
) )
else: else:
# If the original format is not numpy, convert it to numpy. # If the original format is not numpy, convert it to numpy.
input_set = read_data( input_set = read_data(
dataset_path / input_set_per_type["path"], dataset_path / input_data["path"],
input_set_per_type["format"], input_data["format"],
) )
save_data( save_data(
input_set, input_set,
dataset_path / output_set_per_type["path"], dataset_path / output_data["path"],
output_set_per_type["format"], output_set_per_type["format"],
) )
...@@ -245,17 +247,23 @@ class OnDiskDataset(Dataset): ...@@ -245,17 +247,23 @@ class OnDiskDataset(Dataset):
path: edge_data/author-writes-paper-feat.npy path: edge_data/author-writes-paper-feat.npy
train_sets: train_sets:
- - type: paper # could be null for homogeneous graph. - - type: paper # could be null for homogeneous graph.
format: numpy data: # multiple data sources could be specified.
- format: numpy
in_memory: true # If not specified, default to true. in_memory: true # If not specified, default to true.
path: set/paper-train.npy path: set/paper-train-src.npy
- format: numpy
in_memory: false
path: set/paper-train-dst.npy
validation_sets: validation_sets:
- - type: paper - - type: paper
format: numpy data:
- format: numpy
in_memory: true in_memory: true
path: set/paper-validation.npy path: set/paper-validation.npy
test_sets: test_sets:
- - type: paper - - type: paper
format: numpy data:
- format: numpy
in_memory: true in_memory: true
path: set/paper-test.npy path: set/paper-test.npy
...@@ -347,16 +355,21 @@ class OnDiskDataset(Dataset): ...@@ -347,16 +355,21 @@ class OnDiskDataset(Dataset):
assert ( assert (
len(tvt_set) == 1 len(tvt_set) == 1
), "Only one TVT set is allowed if type is not specified." ), "Only one TVT set is allowed if type is not specified."
data = read_data( ret.append(
tvt_set[0].path, tvt_set[0].format, tvt_set[0].in_memory ItemSet(
tuple(
read_data(data.path, data.format, data.in_memory)
for data in tvt_set[0].data
)
)
) )
ret.append(ItemSet(tensor_to_tuple(data)))
else: else:
data = {} data = {}
for tvt in tvt_set: for tvt in tvt_set:
data[tvt.type] = ItemSet( data[tvt.type] = ItemSet(
tensor_to_tuple( tuple(
read_data(tvt.path, tvt.format, tvt.in_memory) read_data(data.path, data.format, data.in_memory)
for data in tvt.data
) )
) )
ret.append(ItemSetDict(data)) ret.append(ItemSetDict(data))
......
...@@ -8,6 +8,7 @@ import pydantic ...@@ -8,6 +8,7 @@ import pydantic
__all__ = [ __all__ = [
"OnDiskFeatureDataFormat", "OnDiskFeatureDataFormat",
"OnDiskTVTSetData",
"OnDiskTVTSet", "OnDiskTVTSet",
"OnDiskFeatureDataDomain", "OnDiskFeatureDataDomain",
"OnDiskFeatureData", "OnDiskFeatureData",
...@@ -24,15 +25,21 @@ class OnDiskFeatureDataFormat(str, Enum): ...@@ -24,15 +25,21 @@ class OnDiskFeatureDataFormat(str, Enum):
NUMPY = "numpy" NUMPY = "numpy"
class OnDiskTVTSet(pydantic.BaseModel): class OnDiskTVTSetData(pydantic.BaseModel):
"""Train-Validation-Test set.""" """Train-Validation-Test set data."""
type: Optional[str] = None
format: OnDiskFeatureDataFormat format: OnDiskFeatureDataFormat
in_memory: Optional[bool] = True in_memory: Optional[bool] = True
path: str path: str
class OnDiskTVTSet(pydantic.BaseModel):
    """One train/validation/test set entry of the on-disk dataset config.

    An entry pairs an optional ``type`` with the list of data sources that
    make up the set.
    """

    # Type name of the set; defaults to None when the config gives no type.
    type: Optional[str] = None
    # Data sources of the set, one ``OnDiskTVTSetData`` per source. Several
    # sources may be listed for a single set.
    data: List[OnDiskTVTSetData]
class OnDiskFeatureDataDomain(str, Enum): class OnDiskFeatureDataDomain(str, Enum):
"""Enum of feature data domain.""" """Enum of feature data domain."""
......
...@@ -45,9 +45,3 @@ def save_data(data, path, fmt): ...@@ -45,9 +45,3 @@ def save_data(data, path, fmt):
np.save(path, data) np.save(path, data)
elif fmt == "torch": elif fmt == "torch":
torch.save(data, path) torch.save(data, path)
def tensor_to_tuple(data):
    """Return the columns of a torch.Tensor as a tuple.

    The tensor is transposed with ``Tensor.t()`` and the entries obtained by
    iterating over the transposed tensor are collected into a tuple.

    Raises
    ------
    AssertionError
        If ``data`` is not a ``torch.Tensor``.
    """
    assert isinstance(data, torch.Tensor), "data must be a torch.Tensor"
    transposed = data.t()
    # Unpacking iterates along the first dimension of the transposed tensor,
    # i.e. over the columns of the input.
    return (*transposed,)
...@@ -22,7 +22,8 @@ def test_OnDiskDataset_TVTSet_exceptions(): ...@@ -22,7 +22,8 @@ def test_OnDiskDataset_TVTSet_exceptions():
yaml_content = """ yaml_content = """
train_sets: train_sets:
- - type: paper - - type: paper
format: torch_invalid data:
- format: torch_invalid
path: set/paper-train.pt path: set/paper-train.pt
""" """
yaml_file = os.path.join(test_dir, "test.yaml") yaml_file = os.path.join(test_dir, "test.yaml")
...@@ -35,10 +36,12 @@ def test_OnDiskDataset_TVTSet_exceptions(): ...@@ -35,10 +36,12 @@ def test_OnDiskDataset_TVTSet_exceptions():
yaml_content = """ yaml_content = """
train_sets: train_sets:
- - type: null - - type: null
format: numpy data:
- format: numpy
path: set/train.npy path: set/train.npy
- type: null - type: null
format: numpy data:
- format: numpy
path: set/train.npy path: set/train.npy
""" """
with open(yaml_file, "w") as f: with open(yaml_file, "w") as f:
...@@ -54,22 +57,25 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label(): ...@@ -54,22 +57,25 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
"""Test TVTSet which returns ItemSet with IDs and labels.""" """Test TVTSet which returns ItemSet with IDs and labels."""
with tempfile.TemporaryDirectory() as test_dir: with tempfile.TemporaryDirectory() as test_dir:
train_ids = np.arange(1000) train_ids = np.arange(1000)
train_ids_path = os.path.join(test_dir, "train_ids.npy")
np.save(train_ids_path, train_ids)
train_labels = np.random.randint(0, 10, size=1000) train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_ids, train_labels]).T train_labels_path = os.path.join(test_dir, "train_labels.npy")
train_path = os.path.join(test_dir, "train.npy") np.save(train_labels_path, train_labels)
np.save(train_path, train_data)
validation_ids = np.arange(1000, 2000) validation_ids = np.arange(1000, 2000)
validation_ids_path = os.path.join(test_dir, "validation_ids.npy")
np.save(validation_ids_path, validation_ids)
validation_labels = np.random.randint(0, 10, size=1000) validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_ids, validation_labels]).T validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
validation_path = os.path.join(test_dir, "validation.npy") np.save(validation_labels_path, validation_labels)
np.save(validation_path, validation_data)
test_ids = np.arange(2000, 3000) test_ids = np.arange(2000, 3000)
test_ids_path = os.path.join(test_dir, "test_ids.npy")
np.save(test_ids_path, test_ids)
test_labels = np.random.randint(0, 10, size=1000) test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_ids, test_labels]).T test_labels_path = os.path.join(test_dir, "test_labels.npy")
test_path = os.path.join(test_dir, "test.npy") np.save(test_labels_path, test_labels)
np.save(test_path, test_data)
# Case 1: # Case 1:
# all TVT sets are specified. # all TVT sets are specified.
...@@ -78,26 +84,30 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label(): ...@@ -78,26 +84,30 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
yaml_content = f""" yaml_content = f"""
train_sets: train_sets:
- - type: null - - type: null
format: numpy data:
- format: numpy
in_memory: true in_memory: true
path: {train_path} path: {train_ids_path}
- - type: null - format: numpy
format: numpy in_memory: true
path: {train_path} path: {train_labels_path}
validation_sets: validation_sets:
- - format: numpy - - data:
path: {validation_path} - format: numpy
- - type: null in_memory: true
format: numpy path: {validation_ids_path}
path: {validation_path} - format: numpy
in_memory: true
path: {validation_labels_path}
test_sets: test_sets:
- - type: null - - type: null
format: numpy data:
in_memory: false - format: numpy
path: {test_path} in_memory: true
- - type: null path: {test_ids_path}
format: numpy - format: numpy
path: {test_path} in_memory: true
path: {test_labels_path}
""" """
yaml_file = os.path.join(test_dir, "test.yaml") yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f: with open(yaml_file, "w") as f:
...@@ -107,7 +117,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label(): ...@@ -107,7 +117,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
# Verify train set. # Verify train set.
train_sets = dataset.train_sets train_sets = dataset.train_sets
assert len(train_sets) == 2 assert len(train_sets) == 1
for train_set in train_sets: for train_set in train_sets:
assert len(train_set) == 1000 assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet) assert isinstance(train_set, gb.ItemSet)
...@@ -118,7 +128,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label(): ...@@ -118,7 +128,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
# Verify validation set. # Verify validation set.
validation_sets = dataset.validation_sets validation_sets = dataset.validation_sets
assert len(validation_sets) == 2 assert len(validation_sets) == 1
for validation_set in validation_sets: for validation_set in validation_sets:
assert len(validation_set) == 1000 assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet) assert isinstance(validation_set, gb.ItemSet)
...@@ -129,7 +139,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label(): ...@@ -129,7 +139,7 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
# Verify test set. # Verify test set.
test_sets = dataset.test_sets test_sets = dataset.test_sets
assert len(test_sets) == 2 assert len(test_sets) == 1
for test_set in test_sets: for test_set in test_sets:
assert len(test_set) == 1000 assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet) assert isinstance(test_set, gb.ItemSet)
...@@ -143,8 +153,9 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label(): ...@@ -143,8 +153,9 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
yaml_content = f""" yaml_content = f"""
train_sets: train_sets:
- - type: null - - type: null
format: numpy data:
path: {train_path} - format: numpy
path: {train_ids_path}
""" """
yaml_file = os.path.join(test_dir, "test.yaml") yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f: with open(yaml_file, "w") as f:
...@@ -160,47 +171,72 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label(): ...@@ -160,47 +171,72 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label(): def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
"""Test TVTSet which returns ItemSet with IDs and labels.""" """Test TVTSet which returns ItemSet with IDs and labels."""
with tempfile.TemporaryDirectory() as test_dir: with tempfile.TemporaryDirectory() as test_dir:
train_pairs = (np.arange(1000), np.arange(1000, 2000)) train_src = np.arange(1000)
train_src_path = os.path.join(test_dir, "train_src.npy")
np.save(train_src_path, train_src)
train_dst = np.arange(1000, 2000)
train_dst_path = os.path.join(test_dir, "train_dst.npy")
np.save(train_dst_path, train_dst)
train_labels = np.random.randint(0, 10, size=1000) train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T train_labels_path = os.path.join(test_dir, "train_labels.npy")
train_path = os.path.join(test_dir, "train.npy") np.save(train_labels_path, train_labels)
np.save(train_path, train_data)
validation_src = np.arange(1000, 2000)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000)) validation_src_path = os.path.join(test_dir, "validation_src.npy")
np.save(validation_src_path, validation_src)
validation_dst = np.arange(2000, 3000)
validation_dst_path = os.path.join(test_dir, "validation_dst.npy")
np.save(validation_dst_path, validation_dst)
validation_labels = np.random.randint(0, 10, size=1000) validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
validation_path = os.path.join(test_dir, "validation.npy") np.save(validation_labels_path, validation_labels)
np.save(validation_path, validation_data)
test_src = np.arange(2000, 3000)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000)) test_src_path = os.path.join(test_dir, "test_src.npy")
np.save(test_src_path, test_src)
test_dst = np.arange(3000, 4000)
test_dst_path = os.path.join(test_dir, "test_dst.npy")
np.save(test_dst_path, test_dst)
test_labels = np.random.randint(0, 10, size=1000) test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T test_labels_path = os.path.join(test_dir, "test_labels.npy")
test_path = os.path.join(test_dir, "test.npy") np.save(test_labels_path, test_labels)
np.save(test_path, test_data)
yaml_content = f""" yaml_content = f"""
train_sets: train_sets:
- - type: null - - type: null
format: numpy data:
- format: numpy
in_memory: true in_memory: true
path: {train_path} path: {train_src_path}
- - type: null - format: numpy
format: numpy in_memory: true
path: {train_path} path: {train_dst_path}
- format: numpy
in_memory: true
path: {train_labels_path}
validation_sets: validation_sets:
- - format: numpy - - data:
path: {validation_path} - format: numpy
- - type: null in_memory: true
format: numpy path: {validation_src_path}
path: {validation_path} - format: numpy
in_memory: true
path: {validation_dst_path}
- format: numpy
in_memory: true
path: {validation_labels_path}
test_sets: test_sets:
- - type: null - - type: null
format: numpy data:
in_memory: false - format: numpy
path: {test_path} in_memory: true
- - type: null path: {test_src_path}
format: numpy - format: numpy
path: {test_path} in_memory: true
path: {test_dst_path}
- format: numpy
in_memory: true
path: {test_labels_path}
""" """
yaml_file = os.path.join(test_dir, "test.yaml") yaml_file = os.path.join(test_dir, "test.yaml")
with open(yaml_file, "w") as f: with open(yaml_file, "w") as f:
...@@ -210,42 +246,162 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label(): ...@@ -210,42 +246,162 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
# Verify train set. # Verify train set.
train_sets = dataset.train_sets train_sets = dataset.train_sets
assert len(train_sets) == 2 assert len(train_sets) == 1
for train_set in train_sets: for train_set in train_sets:
assert len(train_set) == 1000 assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet) assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(train_set): for i, (src, dst, label) in enumerate(train_set):
assert src == train_pairs[0][i] assert src == train_src[i]
assert dst == train_pairs[1][i] assert dst == train_dst[i]
assert label == train_labels[i] assert label == train_labels[i]
train_sets = None train_sets = None
# Verify validation set. # Verify validation set.
validation_sets = dataset.validation_sets validation_sets = dataset.validation_sets
assert len(validation_sets) == 2 assert len(validation_sets) == 1
for validation_set in validation_sets: for validation_set in validation_sets:
assert len(validation_set) == 1000 assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet) assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(validation_set): for i, (src, dst, label) in enumerate(validation_set):
assert src == validation_pairs[0][i] assert src == validation_src[i]
assert dst == validation_pairs[1][i] assert dst == validation_dst[i]
assert label == validation_labels[i] assert label == validation_labels[i]
validation_sets = None validation_sets = None
# Verify test set. # Verify test set.
test_sets = dataset.test_sets test_sets = dataset.test_sets
assert len(test_sets) == 2 assert len(test_sets) == 1
for test_set in test_sets: for test_set in test_sets:
assert len(test_set) == 1000 assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet) assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(test_set): for i, (src, dst, label) in enumerate(test_set):
assert src == test_pairs[0][i] assert src == test_src[i]
assert dst == test_pairs[1][i] assert dst == test_dst[i]
assert label == test_labels[i] assert label == test_labels[i]
test_sets = None test_sets = None
dataset = None dataset = None
def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
    """Test TVTSet which returns ItemSet with node pairs and negative ones.

    For each of the train/validation/test splits, three arrays are saved as
    separate ``.npy`` files: source IDs, destination IDs and a ``(1000, 10)``
    array of negative destinations. The three files of a split are listed as
    the ``data`` sources of a single TVT set. The dataset is then expected to
    expose exactly one ``ItemSet`` per split whose items are
    ``(src, dst, negs)`` triples matching the saved arrays row by row.
    """
    with tempfile.TemporaryDirectory() as test_dir:

        def _save(name, array):
            # Persist ``array`` as ``<test_dir>/<name>.npy`` and return the path.
            path = os.path.join(test_dir, f"{name}.npy")
            np.save(path, array)
            return path

        def _verify(item_sets, src_ref, dst_ref, neg_ref):
            # One untyped TVT set per split must yield exactly one ItemSet
            # whose i-th item equals the i-th row of each saved array.
            assert len(item_sets) == 1
            for item_set in item_sets:
                assert len(item_set) == 1000
                assert isinstance(item_set, gb.ItemSet)
                for i, (src, dst, negs) in enumerate(item_set):
                    assert src == src_ref[i]
                    assert dst == dst_ref[i]
                    assert torch.equal(negs, torch.from_numpy(neg_ref[i]))

        train_src = np.arange(1000)
        train_src_path = _save("train_src", train_src)
        train_dst = np.arange(1000, 2000)
        train_dst_path = _save("train_dst", train_dst)
        # 10 negative destinations per node pair.
        train_neg_dst = np.random.choice(1000 * 10, size=1000 * 10).reshape(
            1000, 10
        )
        train_neg_dst_path = _save("train_neg_dst", train_neg_dst)

        validation_src = np.arange(1000, 2000)
        validation_src_path = _save("validation_src", validation_src)
        validation_dst = np.arange(2000, 3000)
        validation_dst_path = _save("validation_dst", validation_dst)
        # Offset the negatives so each split holds distinguishable values.
        validation_neg_dst = train_neg_dst + 1
        validation_neg_dst_path = _save(
            "validation_neg_dst", validation_neg_dst
        )

        test_src = np.arange(2000, 3000)
        test_src_path = _save("test_src", test_src)
        test_dst = np.arange(3000, 4000)
        test_dst_path = _save("test_dst", test_dst)
        test_neg_dst = train_neg_dst + 2
        test_neg_dst_path = _save("test_neg_dst", test_neg_dst)

        # The nesting below is significant: every ``*_sets`` key holds a list
        # of lists of TVT sets, and each set lists its sources under ``data``.
        # The validation set omits ``type`` to exercise the default.
        yaml_content = f"""
            train_sets:
              - - type: null
                  data:
                    - format: numpy
                      in_memory: true
                      path: {train_src_path}
                    - format: numpy
                      in_memory: true
                      path: {train_dst_path}
                    - format: numpy
                      in_memory: true
                      path: {train_neg_dst_path}
            validation_sets:
              - - data:
                    - format: numpy
                      in_memory: true
                      path: {validation_src_path}
                    - format: numpy
                      in_memory: true
                      path: {validation_dst_path}
                    - format: numpy
                      in_memory: true
                      path: {validation_neg_dst_path}
            test_sets:
              - - type: null
                  data:
                    - format: numpy
                      in_memory: true
                      path: {test_src_path}
                    - format: numpy
                      in_memory: true
                      path: {test_dst_path}
                    - format: numpy
                      in_memory: true
                      path: {test_neg_dst_path}
        """
        yaml_file = os.path.join(test_dir, "test.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)

        dataset = gb.OnDiskDataset(yaml_file)

        # Each split is passed straight into the helper so that no reference
        # to the loaded item sets outlives its verification.
        _verify(dataset.train_sets, train_src, train_dst, train_neg_dst)
        _verify(
            dataset.validation_sets,
            validation_src,
            validation_dst,
            validation_neg_dst,
        )
        _verify(dataset.test_sets, test_src, test_dst, test_neg_dst)

        # Drop the dataset before the temporary directory is cleaned up.
        # NOTE(review): presumably so nothing backed by the files is still
        # referenced at removal time -- confirm.
        dataset = None
def test_OnDiskDataset_TVTSet_ItemSetDict_id_label(): def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
"""Test TVTSet which returns ItemSetDict with IDs and labels.""" """Test TVTSet which returns ItemSetDict with IDs and labels."""
with tempfile.TemporaryDirectory() as test_dir: with tempfile.TemporaryDirectory() as test_dir:
...@@ -270,26 +426,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label(): ...@@ -270,26 +426,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
yaml_content = f""" yaml_content = f"""
train_sets: train_sets:
- - type: paper - - type: paper
format: numpy data:
- format: numpy
in_memory: true in_memory: true
path: {train_path} path: {train_path}
- - type: author - - type: author
format: numpy data:
- format: numpy
path: {train_path} path: {train_path}
validation_sets: validation_sets:
- - type: paper - - type: paper
format: numpy data:
- format: numpy
path: {validation_path} path: {validation_path}
- - type: author - - type: author
format: numpy data:
- format: numpy
path: {validation_path} path: {validation_path}
test_sets: test_sets:
- - type: paper - - type: paper
format: numpy data:
- format: numpy
in_memory: false in_memory: false
path: {test_path} path: {test_path}
- - type: author - - type: author
format: numpy data:
- format: numpy
path: {test_path} path: {test_path}
""" """
yaml_file = os.path.join(test_dir, "test.yaml") yaml_file = os.path.join(test_dir, "test.yaml")
...@@ -372,26 +534,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label(): ...@@ -372,26 +534,32 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
yaml_content = f""" yaml_content = f"""
train_sets: train_sets:
- - type: paper - - type: paper
format: numpy data:
- format: numpy
in_memory: true in_memory: true
path: {train_path} path: {train_path}
- - type: author - - type: author
format: numpy data:
- format: numpy
path: {train_path} path: {train_path}
validation_sets: validation_sets:
- - type: paper - - type: paper
format: numpy data:
- format: numpy
path: {validation_path} path: {validation_path}
- - type: author - - type: author
format: numpy data:
- format: numpy
path: {validation_path} path: {validation_path}
test_sets: test_sets:
- - type: paper - - type: paper
format: numpy data:
- format: numpy
in_memory: false in_memory: false
path: {test_path} path: {test_path}
- - type: author - - type: author
format: numpy data:
- format: numpy
path: {test_path} path: {test_path}
""" """
yaml_file = os.path.join(test_dir, "test.yaml") yaml_file = os.path.join(test_dir, "test.yaml")
...@@ -829,16 +997,18 @@ def test_OnDiskDataset_preprocess_homogeneous(): ...@@ -829,16 +997,18 @@ def test_OnDiskDataset_preprocess_homogeneous():
path: data/node-feat.npy path: data/node-feat.npy
train_sets: train_sets:
- - type_name: null - - type_name: null
# shape: (num_trains, 3), 3 for (src, dst, label). data:
format: numpy - format: numpy
path: set/train.npy path: set/train.npy
validation_sets: validation_sets:
- - type_name: null - - type_name: null
format: numpy data:
- format: numpy
path: set/validation.npy path: set/validation.npy
test_sets: test_sets:
- - type_name: null - - type_name: null
format: numpy data:
- format: numpy
path: set/test.npy path: set/test.npy
""" """
yaml_file = os.path.join(test_dir, "test.yaml") yaml_file = os.path.join(test_dir, "test.yaml")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment