Unverified commit 50b05723, authored by Rhett Ying and committed by GitHub
Browse files

[GraphBolt] update names of ItemSet in OnDiskDataset testcases (#6289)

parent 19d63943
...@@ -313,24 +313,36 @@ class OnDiskDataset(Dataset): ...@@ -313,24 +313,36 @@ class OnDiskDataset(Dataset):
train_set: train_set:
- type: paper # could be null for homogeneous graph. - type: paper # could be null for homogeneous graph.
data: # multiple data sources could be specified. data: # multiple data sources could be specified.
- format: numpy - name: node_pairs
format: numpy
in_memory: true # If not specified, default to true. in_memory: true # If not specified, default to true.
path: set/paper-train-src.npy path: set/paper-train-node_pairs.npy
- format: numpy - name: labels
format: numpy
in_memory: false in_memory: false
path: set/paper-train-dst.npy path: set/paper-train-labels.npy
validation_set: validation_set:
- type: paper - type: paper
data: data:
- format: numpy - name: node_pairs
format: numpy
in_memory: true in_memory: true
path: set/paper-validation.npy path: set/paper-validation-node_pairs.npy
- name: labels
format: numpy
in_memory: true
path: set/paper-validation-labels.npy
test_set: test_set:
- type: paper - type: paper
data: data:
- format: numpy - name: node_pairs
format: numpy
in_memory: true
path: set/paper-test-node_pairs.npy
- name: labels
format: numpy
in_memory: true in_memory: true
path: set/paper-test.npy path: set/paper-test-labels.npy
Parameters Parameters
---------- ----------
......
...@@ -287,35 +287,28 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label(): ...@@ -287,35 +287,28 @@ def test_OnDiskDataset_TVTSet_ItemSet_id_label():
dataset = None dataset = None
def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label(): def test_OnDiskDataset_TVTSet_ItemSet_node_pairs_labels():
"""Test TVTSet which returns ItemSet with IDs and labels.""" """Test TVTSet which returns ItemSet with node pairs and labels."""
with tempfile.TemporaryDirectory() as test_dir: with tempfile.TemporaryDirectory() as test_dir:
train_src = np.arange(1000) train_node_pairs = np.arange(2000).reshape(1000, 2)
train_src_path = os.path.join(test_dir, "train_src.npy") train_node_pairs_path = os.path.join(test_dir, "train_node_pairs.npy")
np.save(train_src_path, train_src) np.save(train_node_pairs_path, train_node_pairs)
train_dst = np.arange(1000, 2000)
train_dst_path = os.path.join(test_dir, "train_dst.npy")
np.save(train_dst_path, train_dst)
train_labels = np.random.randint(0, 10, size=1000) train_labels = np.random.randint(0, 10, size=1000)
train_labels_path = os.path.join(test_dir, "train_labels.npy") train_labels_path = os.path.join(test_dir, "train_labels.npy")
np.save(train_labels_path, train_labels) np.save(train_labels_path, train_labels)
validation_src = np.arange(1000, 2000) validation_node_pairs = np.arange(2000, 4000).reshape(1000, 2)
validation_src_path = os.path.join(test_dir, "validation_src.npy") validation_node_pairs_path = os.path.join(
np.save(validation_src_path, validation_src) test_dir, "validation_node_pairs.npy"
validation_dst = np.arange(2000, 3000) )
validation_dst_path = os.path.join(test_dir, "validation_dst.npy") np.save(validation_node_pairs_path, validation_node_pairs)
np.save(validation_dst_path, validation_dst)
validation_labels = np.random.randint(0, 10, size=1000) validation_labels = np.random.randint(0, 10, size=1000)
validation_labels_path = os.path.join(test_dir, "validation_labels.npy") validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
np.save(validation_labels_path, validation_labels) np.save(validation_labels_path, validation_labels)
test_src = np.arange(2000, 3000) test_node_pairs = np.arange(4000, 6000).reshape(1000, 2)
test_src_path = os.path.join(test_dir, "test_src.npy") test_node_pairs_path = os.path.join(test_dir, "test_node_pairs.npy")
np.save(test_src_path, test_src) np.save(test_node_pairs_path, test_node_pairs)
test_dst = np.arange(3000, 4000)
test_dst_path = os.path.join(test_dir, "test_dst.npy")
np.save(test_dst_path, test_dst)
test_labels = np.random.randint(0, 10, size=1000) test_labels = np.random.randint(0, 10, size=1000)
test_labels_path = os.path.join(test_dir, "test_labels.npy") test_labels_path = os.path.join(test_dir, "test_labels.npy")
np.save(test_labels_path, test_labels) np.save(test_labels_path, test_labels)
...@@ -326,28 +319,20 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label(): ...@@ -326,28 +319,20 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
train_set: train_set:
- type: null - type: null
data: data:
- name: src - name: node_pairs
format: numpy format: numpy
in_memory: true in_memory: true
path: {train_src_path} path: {train_node_pairs_path}
- name: dst
format: numpy
in_memory: true
path: {train_dst_path}
- name: labels - name: labels
format: numpy format: numpy
in_memory: true in_memory: true
path: {train_labels_path} path: {train_labels_path}
validation_set: validation_set:
- data: - data:
- name: src - name: node_pairs
format: numpy
in_memory: true
path: {validation_src_path}
- name: dst
format: numpy format: numpy
in_memory: true in_memory: true
path: {validation_dst_path} path: {validation_node_pairs_path}
- name: labels - name: labels
format: numpy format: numpy
in_memory: true in_memory: true
...@@ -355,14 +340,10 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label(): ...@@ -355,14 +340,10 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
test_set: test_set:
- type: null - type: null
data: data:
- name: src - name: node_pairs
format: numpy
in_memory: true
path: {test_src_path}
- name: dst
format: numpy format: numpy
in_memory: true in_memory: true
path: {test_dst_path} path: {test_node_pairs_path}
- name: labels - name: labels
format: numpy format: numpy
in_memory: true in_memory: true
...@@ -379,70 +360,63 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label(): ...@@ -379,70 +360,63 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_label():
train_set = dataset.tasks[0].train_set train_set = dataset.tasks[0].train_set
assert len(train_set) == 1000 assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet) assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(train_set): for i, (node_pair, label) in enumerate(train_set):
assert src == train_src[i] assert node_pair[0] == train_node_pairs[i][0]
assert dst == train_dst[i] assert node_pair[1] == train_node_pairs[i][1]
assert label == train_labels[i] assert label == train_labels[i]
assert train_set.names == ("src", "dst", "labels") assert train_set.names == ("node_pairs", "labels")
train_set = None train_set = None
# Verify validation set. # Verify validation set.
validation_set = dataset.tasks[0].validation_set validation_set = dataset.tasks[0].validation_set
assert len(validation_set) == 1000 assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet) assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(validation_set): for i, (node_pair, label) in enumerate(validation_set):
assert src == validation_src[i] assert node_pair[0] == validation_node_pairs[i][0]
assert dst == validation_dst[i] assert node_pair[1] == validation_node_pairs[i][1]
assert label == validation_labels[i] assert label == validation_labels[i]
assert validation_set.names == ("src", "dst", "labels") assert validation_set.names == ("node_pairs", "labels")
validation_set = None validation_set = None
# Verify test set. # Verify test set.
test_set = dataset.tasks[0].test_set test_set = dataset.tasks[0].test_set
assert len(test_set) == 1000 assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet) assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, label) in enumerate(test_set): for i, (node_pair, label) in enumerate(test_set):
assert src == test_src[i] assert node_pair[0] == test_node_pairs[i][0]
assert dst == test_dst[i] assert node_pair[1] == test_node_pairs[i][1]
assert label == test_labels[i] assert label == test_labels[i]
assert test_set.names == ("src", "dst", "labels") assert test_set.names == ("node_pairs", "labels")
test_set = None test_set = None
dataset = None dataset = None
def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs(): def test_OnDiskDataset_TVTSet_ItemSet_node_pairs_negs():
"""Test TVTSet which returns ItemSet with node pairs and negative ones.""" """Test TVTSet which returns ItemSet with node pairs and negative ones."""
with tempfile.TemporaryDirectory() as test_dir: with tempfile.TemporaryDirectory() as test_dir:
train_src = np.arange(1000) train_node_pairs = np.arange(2000).reshape(1000, 2)
train_src_path = os.path.join(test_dir, "train_src.npy") train_node_pairs_path = os.path.join(test_dir, "train_node_pairs.npy")
np.save(train_src_path, train_src) np.save(train_node_pairs_path, train_node_pairs)
train_dst = np.arange(1000, 2000)
train_dst_path = os.path.join(test_dir, "train_dst.npy")
np.save(train_dst_path, train_dst)
train_neg_dst = np.random.choice(1000 * 10, size=1000 * 10).reshape( train_neg_dst = np.random.choice(1000 * 10, size=1000 * 10).reshape(
1000, 10 1000, 10
) )
train_neg_dst_path = os.path.join(test_dir, "train_neg_dst.npy") train_neg_dst_path = os.path.join(test_dir, "train_neg_dst.npy")
np.save(train_neg_dst_path, train_neg_dst) np.save(train_neg_dst_path, train_neg_dst)
validation_src = np.arange(1000, 2000) validation_node_pairs = np.arange(2000, 4000).reshape(1000, 2)
validation_src_path = os.path.join(test_dir, "validation_src.npy") validation_node_pairs_path = os.path.join(
np.save(validation_src_path, validation_src) test_dir, "validation_node_pairs.npy"
validation_dst = np.arange(2000, 3000) )
validation_dst_path = os.path.join(test_dir, "validation_dst.npy") np.save(validation_node_pairs_path, validation_node_pairs)
np.save(validation_dst_path, validation_dst)
validation_neg_dst = train_neg_dst + 1 validation_neg_dst = train_neg_dst + 1
validation_neg_dst_path = os.path.join( validation_neg_dst_path = os.path.join(
test_dir, "validation_neg_dst.npy" test_dir, "validation_neg_dst.npy"
) )
np.save(validation_neg_dst_path, validation_neg_dst) np.save(validation_neg_dst_path, validation_neg_dst)
test_src = np.arange(2000, 3000) test_node_pairs = np.arange(4000, 6000).reshape(1000, 2)
test_src_path = os.path.join(test_dir, "test_src.npy") test_node_pairs_path = os.path.join(test_dir, "test_node_pairs.npy")
np.save(test_src_path, test_src) np.save(test_node_pairs_path, test_node_pairs)
test_dst = np.arange(3000, 4000)
test_dst_path = os.path.join(test_dir, "test_dst.npy")
np.save(test_dst_path, test_dst)
test_neg_dst = train_neg_dst + 2 test_neg_dst = train_neg_dst + 2
test_neg_dst_path = os.path.join(test_dir, "test_neg_dst.npy") test_neg_dst_path = os.path.join(test_dir, "test_neg_dst.npy")
np.save(test_neg_dst_path, test_neg_dst) np.save(test_neg_dst_path, test_neg_dst)
...@@ -453,44 +427,32 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs(): ...@@ -453,44 +427,32 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
train_set: train_set:
- type: null - type: null
data: data:
- name: src - name: node_pairs
format: numpy format: numpy
in_memory: true in_memory: true
path: {train_src_path} path: {train_node_pairs_path}
- name: dst - name: negative_dsts
format: numpy
in_memory: true
path: {train_dst_path}
- name: negative_dst
format: numpy format: numpy
in_memory: true in_memory: true
path: {train_neg_dst_path} path: {train_neg_dst_path}
validation_set: validation_set:
- data: - data:
- name: src - name: node_pairs
format: numpy
in_memory: true
path: {validation_src_path}
- name: dst
format: numpy format: numpy
in_memory: true in_memory: true
path: {validation_dst_path} path: {validation_node_pairs_path}
- name: negative_dst - name: negative_dsts
format: numpy format: numpy
in_memory: true in_memory: true
path: {validation_neg_dst_path} path: {validation_neg_dst_path}
test_set: test_set:
- type: null - type: null
data: data:
- name: src - name: node_pairs
format: numpy
in_memory: true
path: {test_src_path}
- name: dst
format: numpy format: numpy
in_memory: true in_memory: true
path: {test_dst_path} path: {test_node_pairs_path}
- name: negative_dst - name: negative_dsts
format: numpy format: numpy
in_memory: true in_memory: true
path: {test_neg_dst_path} path: {test_neg_dst_path}
...@@ -506,33 +468,33 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs(): ...@@ -506,33 +468,33 @@ def test_OnDiskDataset_TVTSet_ItemSet_node_pair_negs():
train_set = dataset.tasks[0].train_set train_set = dataset.tasks[0].train_set
assert len(train_set) == 1000 assert len(train_set) == 1000
assert isinstance(train_set, gb.ItemSet) assert isinstance(train_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(train_set): for i, (node_pair, negs) in enumerate(train_set):
assert src == train_src[i] assert node_pair[0] == train_node_pairs[i][0]
assert dst == train_dst[i] assert node_pair[1] == train_node_pairs[i][1]
assert torch.equal(negs, torch.from_numpy(train_neg_dst[i])) assert torch.equal(negs, torch.from_numpy(train_neg_dst[i]))
assert train_set.names == ("src", "dst", "negative_dst") assert train_set.names == ("node_pairs", "negative_dsts")
train_set = None train_set = None
# Verify validation set. # Verify validation set.
validation_set = dataset.tasks[0].validation_set validation_set = dataset.tasks[0].validation_set
assert len(validation_set) == 1000 assert len(validation_set) == 1000
assert isinstance(validation_set, gb.ItemSet) assert isinstance(validation_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(validation_set): for i, (node_pair, negs) in enumerate(validation_set):
assert src == validation_src[i] assert node_pair[0] == validation_node_pairs[i][0]
assert dst == validation_dst[i] assert node_pair[1] == validation_node_pairs[i][1]
assert torch.equal(negs, torch.from_numpy(validation_neg_dst[i])) assert torch.equal(negs, torch.from_numpy(validation_neg_dst[i]))
assert validation_set.names == ("src", "dst", "negative_dst") assert validation_set.names == ("node_pairs", "negative_dsts")
validation_set = None validation_set = None
# Verify test set. # Verify test set.
test_set = dataset.tasks[0].test_set test_set = dataset.tasks[0].test_set
assert len(test_set) == 1000 assert len(test_set) == 1000
assert isinstance(test_set, gb.ItemSet) assert isinstance(test_set, gb.ItemSet)
for i, (src, dst, negs) in enumerate(test_set): for i, (node_pair, negs) in enumerate(test_set):
assert src == test_src[i] assert node_pair[0] == test_node_pairs[i][0]
assert dst == test_dst[i] assert node_pair[1] == test_node_pairs[i][1]
assert torch.equal(negs, torch.from_numpy(test_neg_dst[i])) assert torch.equal(negs, torch.from_numpy(test_neg_dst[i]))
assert test_set.names == ("src", "dst", "negative_dst") assert test_set.names == ("node_pairs", "negative_dsts")
test_set = None test_set = None
dataset = None dataset = None
...@@ -651,65 +613,92 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label(): ...@@ -651,65 +613,92 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_id_label():
dataset = None dataset = None
def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label(): def test_OnDiskDataset_TVTSet_ItemSetDict_node_pairs_labels():
"""Test TVTSet which returns ItemSetDict with node pairs and labels.""" """Test TVTSet which returns ItemSetDict with node pairs and labels."""
with tempfile.TemporaryDirectory() as test_dir: with tempfile.TemporaryDirectory() as test_dir:
train_pairs = (np.arange(1000), np.arange(1000, 2000)) train_node_pairs = np.arange(2000).reshape(1000, 2)
train_node_pairs_path = os.path.join(test_dir, "train_node_pairs.npy")
np.save(train_node_pairs_path, train_node_pairs)
train_labels = np.random.randint(0, 10, size=1000) train_labels = np.random.randint(0, 10, size=1000)
train_data = np.vstack([train_pairs, train_labels]).T train_labels_path = os.path.join(test_dir, "train_labels.npy")
train_path = os.path.join(test_dir, "train.npy") np.save(train_labels_path, train_labels)
np.save(train_path, train_data)
validation_pairs = (np.arange(1000, 2000), np.arange(2000, 3000)) validation_node_pairs = np.arange(2000, 4000).reshape(1000, 2)
validation_node_pairs_path = os.path.join(
test_dir, "validation_node_pairs.npy"
)
np.save(validation_node_pairs_path, validation_node_pairs)
validation_labels = np.random.randint(0, 10, size=1000) validation_labels = np.random.randint(0, 10, size=1000)
validation_data = np.vstack([validation_pairs, validation_labels]).T validation_labels_path = os.path.join(test_dir, "validation_labels.npy")
validation_path = os.path.join(test_dir, "validation.npy") np.save(validation_labels_path, validation_labels)
np.save(validation_path, validation_data)
test_pairs = (np.arange(2000, 3000), np.arange(3000, 4000)) test_node_pairs = np.arange(4000, 6000).reshape(1000, 2)
test_node_pairs_path = os.path.join(test_dir, "test_node_pairs.npy")
np.save(test_node_pairs_path, test_node_pairs)
test_labels = np.random.randint(0, 10, size=1000) test_labels = np.random.randint(0, 10, size=1000)
test_data = np.vstack([test_pairs, test_labels]).T test_labels_path = os.path.join(test_dir, "test_labels.npy")
test_path = os.path.join(test_dir, "test.npy") np.save(test_labels_path, test_labels)
np.save(test_path, test_data)
yaml_content = f""" yaml_content = f"""
tasks: tasks:
- name: edge_classification - name: edge_classification
train_set: train_set:
- type: paper - type: paper:cites:paper
data: data:
- name: node_pair - name: node_pairs
format: numpy format: numpy
in_memory: true in_memory: true
path: {train_path} path: {train_node_pairs_path}
- type: author - name: labels
format: numpy
in_memory: true
path: {train_labels_path}
- type: author:writes:paper
data: data:
- name: node_pair - name: node_pairs
format: numpy format: numpy
path: {train_path} path: {train_node_pairs_path}
- name: labels
format: numpy
path: {train_labels_path}
validation_set: validation_set:
- type: paper - type: paper:cites:paper
data: data:
- name: node_pair - name: node_pairs
format: numpy format: numpy
path: {validation_path} path: {validation_node_pairs_path}
- type: author - name: labels
format: numpy
path: {validation_labels_path}
- type: author:writes:paper
data: data:
- name: node_pair - name: node_pairs
format: numpy format: numpy
path: {validation_path} path: {validation_node_pairs_path}
- name: labels
format: numpy
path: {validation_labels_path}
test_set: test_set:
- type: paper - type: paper:cites:paper
data: data:
- name: node_pair - name: node_pairs
format: numpy format: numpy
in_memory: false in_memory: true
path: {test_path} path: {test_node_pairs_path}
- type: author - name: labels
format: numpy
in_memory: true
path: {test_labels_path}
- type: author:writes:paper
data: data:
- name: node_pair - name: node_pairs
format: numpy format: numpy
path: {test_path} in_memory: true
path: {test_node_pairs_path}
- name: labels
format: numpy
in_memory: true
path: {test_labels_path}
""" """
os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True) os.makedirs(os.path.join(test_dir, "preprocessed"), exist_ok=True)
yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml") yaml_file = os.path.join(test_dir, "preprocessed/metadata.yaml")
...@@ -726,12 +715,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label(): ...@@ -726,12 +715,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
assert isinstance(item, dict) assert isinstance(item, dict)
assert len(item) == 1 assert len(item) == 1
key = list(item.keys())[0] key = list(item.keys())[0]
assert key in ["paper", "author"] assert key in ["paper:cites:paper", "author:writes:paper"]
src, dst, label = item[key] node_pair, label = item[key]
assert src == train_pairs[0][i % 1000] assert node_pair[0] == train_node_pairs[i % 1000][0]
assert dst == train_pairs[1][i % 1000] assert node_pair[1] == train_node_pairs[i % 1000][1]
assert label == train_labels[i % 1000] assert label == train_labels[i % 1000]
assert train_set.names == ("node_pair",) assert train_set.names == ("node_pairs", "labels")
train_set = None train_set = None
# Verify validation set. # Verify validation set.
...@@ -742,12 +731,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label(): ...@@ -742,12 +731,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
assert isinstance(item, dict) assert isinstance(item, dict)
assert len(item) == 1 assert len(item) == 1
key = list(item.keys())[0] key = list(item.keys())[0]
assert key in ["paper", "author"] assert key in ["paper:cites:paper", "author:writes:paper"]
src, dst, label = item[key] node_pair, label = item[key]
assert src == validation_pairs[0][i % 1000] assert node_pair[0] == validation_node_pairs[i % 1000][0]
assert dst == validation_pairs[1][i % 1000] assert node_pair[1] == validation_node_pairs[i % 1000][1]
assert label == validation_labels[i % 1000] assert label == validation_labels[i % 1000]
assert validation_set.names == ("node_pair",) assert validation_set.names == ("node_pairs", "labels")
validation_set = None validation_set = None
# Verify test set. # Verify test set.
...@@ -758,12 +747,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label(): ...@@ -758,12 +747,12 @@ def test_OnDiskDataset_TVTSet_ItemSetDict_node_pair_label():
assert isinstance(item, dict) assert isinstance(item, dict)
assert len(item) == 1 assert len(item) == 1
key = list(item.keys())[0] key = list(item.keys())[0]
assert key in ["paper", "author"] assert key in ["paper:cites:paper", "author:writes:paper"]
src, dst, label = item[key] node_pair, label = item[key]
assert src == test_pairs[0][i % 1000] assert node_pair[0] == test_node_pairs[i % 1000][0]
assert dst == test_pairs[1][i % 1000] assert node_pair[1] == test_node_pairs[i % 1000][1]
assert label == test_labels[i % 1000] assert label == test_labels[i % 1000]
assert test_set.names == ("node_pair",) assert test_set.names == ("node_pairs", "labels")
test_set = None test_set = None
dataset = None dataset = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment