Unverified commit 50b05723, authored by Rhett Ying and committed by GitHub
Browse files

[GraphBolt] update names of ItemSet in OnDiskDataset testcases (#6289)

parent 19d63943
......@@ -313,24 +313,36 @@ class OnDiskDataset(Dataset):
train_set:
- type: paper # could be null for homogeneous graph.
data: # multiple data sources could be specified.
- format: numpy
- name: node_pairs
format: numpy
in_memory: true # If not specified, default to true.
path: set/paper-train-src.npy
- format: numpy
path: set/paper-train-node_pairs.npy
- name: labels
format: numpy
in_memory: false
path: set/paper-train-dst.npy
path: set/paper-train-labels.npy
validation_set:
- type: paper
data:
- format: numpy
- name: node_pairs
format: numpy
in_memory: true
path: set/paper-validation-node_pairs.npy
- name: labels
format: numpy
in_memory: true
path: set/paper-validation.npy
path: set/paper-validation-labels.npy
test_set:
- type: paper
data:
- format: numpy
- name: node_pairs
format: numpy
in_memory: true
path: set/paper-test-node_pairs.npy
- name: labels
format: numpy
in_memory: true
path: set/paper-test.npy
path: set/paper-test-labels.npy
Parameters
----------
......
......@@ -287,35 +287,28 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
dataset = None
def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
"""Test TVTSet which returns ItemSet with IDs and labels."""
def test_OnDiskDataset_TVTSet_ItemSet_node_pairs_labels():
"""Test TVTSet which returns ItemSet with node pairs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_src = np.arange(1000)
train_src_path = os.path.join(test_dir, "train_src.npy")
np.save(train_src_path, train_src)
train_dst = np.arange(1000, 2000)
train_dst_path = os.path.join(test_dir, "train_dst.npy")
np.save(train_dst_path, train_dst)
train_node_pairs = np.arange(2000).reshape(1000, 2)
train_node_pairs_path = os.path.join(test_dir, "train_node_pairs.npy")
np.save(train_node_pairs_path, train_node_pairs)
train_labels = np.random.randint(0, 10, size=1000)
train_labels_path = os.path.join(test_dir, "train_labels.npy")
np.save(train_labels_path, train_labels)
validation_src = np.arange(1000, 2000)
validation_src_path = os.path.join(test_dir, "validation_src.npy")
np.save(validation_src_path, validation_src)
validation_dst = np.arange(2000, 3000)
validation_dst_path = os.path.join(test_dir, "validation_dst.npy")
np.save(validation_dst_path, validation_dst)
validation_node_pairs = np.arange(2000, 4000).reshape(1000, 2)
validation_node_pairs_path = os.path.join(
test_dir, "validation_node_pairs.npy"
)
np.save(validation_node_pairs_path, validation_node_pairs)
validation_labels = np.random.randint(0, 10, size=1000)
validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
np.save(validation_labels_path, validation_labels)
test_src = np.arange(2000, 3000)
test_src_path = os.path.join(test_dir, "test_src.npy")
np.save(test_src_path, test_src)
test_dst = np.arange(3000, 4000)
test_dst_path = os.path.join(test_dir, "test_dst.npy")
np.save(test_dst_path, test_dst)
test_node_pairs = np.arange(4000, 6000).reshape(1000, 2)
test_node_pairs_path = os.path.join(test_dir, "test_node_pairs.npy")
np.save(test_node_pairs_path, test_node_pairs)
test_labels = np.random.randint(0, 10, size=1000)
test_labels_path = os.path.join(test_dir, "test_labels.npy")
np.save(test_labels_path, test_labels)
......@@ -326,28 +319,20 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
train_set:
- type: null
data:
- name: src
- name: node_pairs
format: numpy
in_memory: true
path: {train_src_path}
- name: dst
format: numpy
in_memory: true
path: {train_dst_path}
path: {train_node_pairs_path}
- name: labels
format: numpy
in_memory: true
path: {train_labels_path}
validation_set:
- data:
- name: src
format: numpy
in_memory: true
path: {validation_src_path}
- name: dst
- name: node_pairs
format: numpy
in_memory: true
path: {validation_dst_path}
path: {validation_node_pairs_path}
- name: labels
format: numpy
in_memory: true
......@@ -355,14 +340,10 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
test_set:
- type: null
data:
- name: src
format: numpy
in_memory: true
path: {test_src_path}
- name: dst
- name: node_pairs
format: numpy
in_memory: true
path: {test_dst_path}
path: {test_node_pairs_path}
- name: labels
format: numpy
in_memory: true
......@@ -379,70 +360,63 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
train_set = dataset.tasks[0].train_set
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(train_set):
assert src == train_src[i]
assert dst == train_dst[i]
for i, (node_pair, label) in enumerate(train_set):
assert node_pair[0] == train_node_pairs[i][0]
assert node_pair[1] == train_node_pairs[i][1]
assert label == train_labels[i]
assert train_set.names == ("src", "dst", "labels")
assert train_set.names == ("node_pairs", "labels")
train_set = None
# Verify validation set.
validation_set = dataset.tasks[0].validation_set
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(validation_set):
assert src == validation_src[i]
assert dst == validation_dst[i]
for i, (node_pair, label) in enumerate(validation_set):
assert node_pair[0] == validation_node_pairs[i][0]
assert node_pair[1] == validation_node_pairs[i][1]
assert label == validation_labels[i]
assert validation_set.names == ("src", "dst", "labels")
assert validation_set.names == ("node_pairs", "labels")
validation_set = None
# Verify test set.
test_set = dataset.tasks[0].test_set
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(test_set):
assert src == test_src[i]
assert dst == test_dst[i]
for i, (node_pair, label) in enumerate(test_set):
assert node_pair[0] == test_node_pairs[i][0]
assert node_pair[1] == test_node_pairs[i][1]
assert label == test_labels[i]
assert test_set.names == ("src", "dst", "labels")
assert test_set.names == ("node_pairs", "labels")
test_set = None
dataset = None
def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
def test_OnDiskDataset_TVTSet_ItemSet_node_pairs_negs():
"""Test TVTSet which returns ItemSet with node pairs and negative ones."""
with tempfile.TemporaryDirectory() as test_dir:
train_src = np.arange(1000)
train_src_path = os.path.join(test_dir, "train_src.npy")
np.save(train_src_path, train_src)
train_dst = np.arange(1000, 2000)
train_dst_path = os.path.join(test_dir, "train_dst.npy")
np.save(train_dst_path, train_dst)
train_node_pairs = np.arange(2000).reshape(1000, 2)
train_node_pairs_path = os.path.join(test_dir, "train_node_pairs.npy")
np.save(train_node_pairs_path, train_node_pairs)
train_neg_dst = np.random.choice(1000 * 10, size=1000 * 10).reshape(
1000, 10
)
train_neg_dst_path = os.path.join(test_dir, "train_neg_dst.npy")
np.save(train_neg_dst_path, train_neg_dst)
validation_src = np.arange(1000, 2000)
validation_src_path = os.path.join(test_dir, "validation_src.npy")
np.save(validation_src_path, validation_src)
validation_dst = np.arange(2000, 3000)
validation_dst_path = os.path.join(test_dir, "validation_dst.npy")
np.save(validation_dst_path, validation_dst)
validation_node_pairs = np.arange(2000, 4000).reshape(1000, 2)
validation_node_pairs_path = os.path.join(
test_dir, "validation_node_pairs.npy"
)
np.save(validation_node_pairs_path, validation_node_pairs)
validation_neg_dst = train_neg_dst + 1
validation_neg_dst_path = os.path.join(
test_dir, "validation_neg_dst.npy"
)
np.save(validation_neg_dst_path, validation_neg_dst)
test_src = np.arange(2000, 3000)
test_src_path = os.path.join(test_dir, "test_src.npy")
np.save(test_src_path, test_src)
test_dst = np.arange(3000, 4000)
test_dst_path = os.path.join(test_dir, "test_dst.npy")
np.save(test_dst_path, test_dst)
test_node_pairs = np.arange(4000, 6000).reshape(1000, 2)
test_node_pairs_path = os.path.join(test_dir, "test_node_pairs.npy")
np.save(test_node_pairs_path, test_node_pairs)
test_neg_dst = train_neg_dst + 2
test_neg_dst_path = os.path.join(test_dir, "test_neg_dst.npy")
np.save(test_neg_dst_path, test_neg_dst)
......@@ -453,44 +427,32 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
train_set:
- type: null
data:
- name: src
- name: node_pairs
format: numpy
in_memory: true
path: {train_src_path}
- name: dst
format: numpy
in_memory: true
path: {train_dst_path}
- name: negative_dst
path: {train_node_pairs_path}
- name: negative_dsts
format: numpy
in_memory: true
path: {train_neg_dst_path}
validation_set:
- data:
- name: src
format: numpy
in_memory: true
path: {validation_src_path}
- name: dst
- name: node_pairs
format: numpy
in_memory: true
path: {validation_dst_path}
- name: negative_dst
path: {validation_node_pairs_path}
- name: negative_dsts
format: numpy
in_memory: true
path: {validation_neg_dst_path}
test_set:
- type: null
data:
- name: src
format: numpy
in_memory: true
path: {test_src_path}
- name: dst
- name: node_pairs
format: numpy
in_memory: true
path: {test_dst_path}
- name: negative_dst
path: {test_node_pairs_path}
- name: negative_dsts
format: numpy
in_memory: true
path: {test_neg_dst_path}
......@@ -506,33 +468,33 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
train_set = dataset.tasks[0].train_set
assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(train_set):
assert src == train_src[i]
assert dst == train_dst[i]
for i, (node_pair, negs) in enumerate(train_set):
assert node_pair[0] == train_node_pairs[i][0]
assert node_pair[1] == train_node_pairs[i][1]
assert torch.equal(negs, torch.from_numpy(train_neg_dst[i]))
assert train_set.names == ("src", "dst", "negative_dst")
assert train_set.names == ("node_pairs", "negative_dsts")
train_set = None
# Verify validation set.
validation_set = dataset.tasks[0].validation_set
assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(validation_set):
assert src == validation_src[i]
assert dst == validation_dst[i]
for i, (node_pair, negs) in enumerate(validation_set):
assert node_pair[0] == validation_node_pairs[i][0]
assert node_pair[1] == validation_node_pairs[i][1]
assert torch.equal(negs, torch.from_numpy(validation_neg_dst[i]))
assert validation_set.names == ("src", "dst", "negative_dst")
assert validation_set.names == ("node_pairs", "negative_dsts")
validation_set = None
# Verify test set.
test_set = dataset.tasks[0].test_set
assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(test_set):
assert src == test_src[i]
assert dst == test_dst[i]
for i, (node_pair, negs) in enumerate(test_set):
assert node_pair[0] == test_node_pairs[i][0]
assert node_pair[1] == test_node_pairs[i][1]
assert torch.equal(negs, torch.from_numpy(test_neg_dst[i]))
assert test_set.names == ("src", "dst", "negative_dst")
assert test_set.names == ("node_pairs", "negative_dsts")
test_set = None
dataset = None
......@@ -651,65 +613,92 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
dataset = None
def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
def test_OnDiskDataset_TVTSet_ItemSetDict_node_pairs_labels():
"""Test TVTSet which returns ItemSetDict with node pairs and labels."""
with tempfile.TemporaryDirectory() as test_dir:
train_pairs = (np.arange(1000), np.arange(1000, 2000))
train_node_pairs = np.arange(2000).reshape(1000, 2)
train_node_pairs_path = os.path.join(test_dir, "train_node_pairs.npy")
np.save(train_node_pairs_path, train_node_pairs)
train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T
train_path = os.path.join(test_dir, "train.npy")
np.save(train_path, train_data)
train_labels_path = os.path.join(test_dir, "train_labels.npy")
np.save(train_labels_path, train_labels)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000))
validation_node_pairs = np.arange(2000, 4000).reshape(1000, 2)
validation_node_pairs_path = os.path.join(
test_dir, "validation_node_pairs.npy"
)
np.save(validation_node_pairs_path, validation_node_pairs)
validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T
validation_path = os.path.join(test_dir, "validation.npy")
np.save(validation_path, validation_data)
validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
np.save(validation_labels_path, validation_labels)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000))
test_node_pairs = np.arange(4000, 6000).reshape(1000, 2)
test_node_pairs_path = os.path.join(test_dir, "test_node_pairs.npy")
np.save(test_node_pairs_path, test_node_pairs)
test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T
test_path = os.path.join(test_dir, "test.npy")
np.save(test_path, test_data)
test_labels_path = os.path.join(test_dir, "test_labels.npy")
np.save(test_labels_path, test_labels)
yaml_content = f"""
tasks:
- name: edge_classification
train_set:
- type: paper
- type: paper:cites:paper
data:
- name: node_pair
- name: node_pairs
format: numpy
in_memory: true
path: {train_path}
- type: author
path: {train_node_pairs_path}
- name: labels
format: numpy
in_memory: true
path: {train_labels_path}
- type: author:writes:paper
data:
- name: node_pair
- name: node_pairs
format: numpy
path: {train_path}
path: {train_node_pairs_path}
- name: labels
format: numpy
path: {train_labels_path}
validation_set:
- type: paper
- type: paper:cites:paper
data:
- name: node_pair
- name: node_pairs
format: numpy
path: {validation_path}
- type: author
path: {validation_node_pairs_path}
- name: labels
format: numpy
path: {validation_labels_path}
- type: author:writes:paper
data:
- name: node_pair
- name: node_pairs
format: numpy
path: {validation_path}
path: {validation_node_pairs_path}
- name: labels
format: numpy
path: {validation_labels_path}
test_set:
- type: paper
- type: paper:cites:paper
data:
- name: node_pair
- name: node_pairs
format: numpy
in_memory: false
path: {test_path}
- type: author
in_memory: true
path: {test_node_pairs_path}
- name: labels
format: numpy
in_memory: true
path: {test_labels_path}
- type: author:writes:paper
data:
- name: node_pair
- name: node_pairs
format: numpy
path: {test_path}
in_memory: true
path: {test_node_pairs_path}
- name: labels
format: numpy
in_memory: true
path: {test_labels_path}
"""
os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
......@@ -726,12 +715,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == train_pairs[0][i % 1000]
assert dst == train_pairs[1][i % 1000]
assert key in ["paper:cites:paper", "author:writes:paper"]
node_pair, label = item[key]
assert node_pair[0] == train_node_pairs[i % 1000][0]
assert node_pair[1] == train_node_pairs[i % 1000][1]
assert label == train_labels[i % 1000]
assert train_set.names == ("node_pair",)
assert train_set.names == ("node_pairs", "labels")
train_set = None
# Verify validation set.
......@@ -742,12 +731,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == validation_pairs[0][i % 1000]
assert dst == validation_pairs[1][i % 1000]
assert key in ["paper:cites:paper", "author:writes:paper"]
node_pair, label = item[key]
assert node_pair[0] == validation_node_pairs[i % 1000][0]
assert node_pair[1] == validation_node_pairs[i % 1000][1]
assert label == validation_labels[i % 1000]
assert validation_set.names == ("node_pair",)
assert validation_set.names == ("node_pairs", "labels")
validation_set = None
# Verify test set.
......@@ -758,12 +747,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
assert isinstance(item, dict)
assert len(item) == 1
key = list(item.keys())[0]
assert key in ["paper", "author"]
src, dst, label = item[key]
assert src == test_pairs[0][i % 1000]
assert dst == test_pairs[1][i % 1000]
assert key in ["paper:cites:paper", "author:writes:paper"]
node_pair, label = item[key]
assert node_pair[0] == test_node_pairs[i % 1000][0]
assert node_pair[1] == test_node_pairs[i % 1000][1]
assert label == test_labels[i % 1000]
assert test_set.names == ("node_pair",)
assert test_set.names == ("node_pairs", "labels")
test_set = None
dataset = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment