Unverified Commit 661f8177 authored by peizhou001's avatar peizhou001 Committed by GitHub
Browse files

[Graphbolt]Add data format (#6075)

parent 14f396d0
...@@ -11,14 +11,30 @@ class LinkPredictionEdgeFormat(Enum): ...@@ -11,14 +11,30 @@ class LinkPredictionEdgeFormat(Enum):
in link prediction: in link prediction:
Attributes: Attributes:
CONDITIONED: Represents the 'conditioned' format where data is
structured as quadruples `[u, v, [negative heads], [negative tails]]`
indicating the source and destination nodes of positive and negative edges.
INDEPENDENT: Represents the 'independent' format where data is structured INDEPENDENT: Represents the 'independent' format where data is structured
as triples `[u, v, label]` indicating the source and destination nodes of as triples `(u, v, label)` indicating the source and destination nodes of
an edge, with a label (0 or 1) denoting it as negative or positive. an edge, with a label (0 or 1) denoting it as negative or positive.
CONDITIONED: Represents the 'conditioned' format where data is structured
as quadruples `(u, v, neg_u, neg_v)` indicating the source and destination
nodes of positive and negative edges. 'u' and 'v' are 1D tensors with
the same shape, while 'neg_u' and 'neg_v' are 2D tensors with the same
shape.
HEAD_CONDITIONED: Represents the 'head conditioned' format where data is
structured as triples `(u, v, neg_u)`, where '(u, v)' signifies the
source and destination nodes of positive edges, while each node in
'neg_u' is paired with 'v' to form negative edges. 'u' and 'v' are
1D tensors with the same shape, while 'neg_u' is a 2D tensor.
TAIL_CONDITIONED: Represents the 'tail conditioned' format where data is
structured as triples `(u, v, neg_v)`, where '(u, v)' signifies the
source and destination nodes of positive edges, while 'u' is paired
with each node in 'neg_v' to form negative edges. 'u' and 'v' are
1D tensors with the same shape, while 'neg_v' is a 2D tensor.
""" """
CONDITIONED = "conditioned"
INDEPENDENT = "independent" INDEPENDENT = "independent"
CONDITIONED = "conditioned"
HEAD_CONDITIONED = "head_conditioned"
TAIL_CONDITIONED = "tail_conditioned"
...@@ -31,16 +31,7 @@ class UniformNegativeSampler(NegativeSampler): ...@@ -31,16 +31,7 @@ class UniformNegativeSampler(NegativeSampler):
negative_ratio : int negative_ratio : int
The proportion of negative samples to positive samples. The proportion of negative samples to positive samples.
output_format : LinkPredictionEdgeFormat output_format : LinkPredictionEdgeFormat
Determines the format of the output data: Determines the format of the output data.
- Conditioned format: Outputs data as quadruples
`[u, v, [negative heads], [negative tails]]`. Here, 'u' and 'v'
are the source and destination nodes of positive edges, while
'negative heads' and 'negative tails' refer to the source and
destination nodes of negative edges.
- Independent format: Outputs data as triples `[u, v, label]`.
In this case, 'u' and 'v' are the source and destination nodes
of an edge, and 'label' indicates whether the edge is negative
(0) or positive (1).
graph : CSCSamplingGraph graph : CSCSamplingGraph
The graph on which to perform negative sampling. The graph on which to perform negative sampling.
......
...@@ -30,16 +30,7 @@ class NegativeSampler(Mapper): ...@@ -30,16 +30,7 @@ class NegativeSampler(Mapper):
negative_ratio : int negative_ratio : int
The proportion of negative samples to positive samples. The proportion of negative samples to positive samples.
output_format : LinkPredictionEdgeFormat output_format : LinkPredictionEdgeFormat
Determines the edge format of the output data: Determines the edge format of the output data.
- Conditioned format: Outputs data as quadruples
`[u, v, [negative heads], [negative tails]]`. Here, 'u' and 'v'
are the source and destination nodes of positive edges, while
'negative heads' and 'negative tails' refer to the source and
destination nodes of negative edges.
- Independent format: Outputs data as triples `[u, v, label]`.
In this case, 'u' and 'v' are the source and destination nodes
of an edge, and 'label' indicates whether the edge is negative
(0) or positive (1).
""" """
super().__init__(datapipe, self._sample) super().__init__(datapipe, self._sample)
assert negative_ratio > 0, "Negative_ratio should be positive Integer." assert negative_ratio > 0, "Negative_ratio should be positive Integer."
...@@ -129,5 +120,15 @@ class NegativeSampler(Mapper): ...@@ -129,5 +120,15 @@ class NegativeSampler(Mapper):
neg_src = neg_src.view(-1, self.negative_ratio) neg_src = neg_src.view(-1, self.negative_ratio)
neg_dst = neg_dst.view(-1, self.negative_ratio) neg_dst = neg_dst.view(-1, self.negative_ratio)
return (pos_src, pos_dst, neg_src, neg_dst) return (pos_src, pos_dst, neg_src, neg_dst)
elif self.output_format == LinkPredictionEdgeFormat.HEAD_CONDITIONED:
pos_src, pos_dst = pos_pairs
neg_src, _ = neg_pairs
neg_src = neg_src.view(-1, self.negative_ratio)
return (pos_src, pos_dst, neg_src)
elif self.output_format == LinkPredictionEdgeFormat.TAIL_CONDITIONED:
pos_src, pos_dst = pos_pairs
_, neg_dst = neg_pairs
neg_dst = neg_dst.view(-1, self.negative_ratio)
return (pos_src, pos_dst, neg_dst)
else: else:
raise ValueError("Unsupported output format.") raise ValueError("Unsupported output format.")
...@@ -67,3 +67,65 @@ def test_NegativeSampler_Conditioned_Format(negative_ratio): ...@@ -67,3 +67,65 @@ def test_NegativeSampler_Conditioned_Format(negative_ratio):
assert neg_dst.numel() == batch_size * negative_ratio assert neg_dst.numel() == batch_size * negative_ratio
expected_src = pos_src.repeat(negative_ratio).view(-1, negative_ratio) expected_src = pos_src.repeat(negative_ratio).view(-1, negative_ratio)
assert torch.equal(expected_src, neg_src) assert torch.equal(expected_src, neg_src)
@pytest.mark.parametrize("negative_ratio", [1, 5, 10, 20])
def test_NegativeSampler_Head_Conditioned_Format(negative_ratio):
    """Check the HEAD_CONDITIONED output of ``UniformNegativeSampler``.

    Every minibatch must unpack into ``(pos_src, pos_dst, neg_src)``. Each of
    the three tensors has ``batch_size`` entries along its first dimension,
    and ``neg_src`` holds ``batch_size * negative_ratio`` elements in total.
    """
    num_seeds = 30
    batch_size = 10

    # Graph the negative sampler draws from.
    graph = gb_test_utils.rand_csc_graph(100, 0.05)

    # Seed pairs (i, i + num_seeds) for i in [0, num_seeds).
    seed_src = torch.arange(0, num_seeds)
    seed_dst = torch.arange(num_seeds, num_seeds * 2)
    item_set = gb.ItemSet((seed_src, seed_dst))
    minibatch_sampler = gb.MinibatchSampler(item_set, batch_size=batch_size)

    negative_sampler = gb.UniformNegativeSampler(
        minibatch_sampler,
        negative_ratio,
        gb.LinkPredictionEdgeFormat.HEAD_CONDITIONED,
        graph,
    )

    # Draw negatives batch by batch and verify each triple.
    for pos_src, pos_dst, neg_src in negative_sampler:
        # Assertions: one row per positive edge in every returned tensor.
        for tensor in (pos_src, pos_dst, neg_src):
            assert len(tensor) == batch_size
        assert neg_src.numel() == batch_size * negative_ratio
        # NOTE(review): ``repeat`` tiles the whole tensor, so for
        # negative_ratio > 1 row i is not ``negative_ratio`` copies of
        # pos_src[i] -- confirm this layout is the intended one.
        tiled_src = pos_src.repeat(negative_ratio).view(-1, negative_ratio)
        assert torch.equal(tiled_src, neg_src)
@pytest.mark.parametrize("negative_ratio", [1, 5, 10, 20])
def test_NegativeSampler_Tail_Conditioned_Format(negative_ratio):
    """Check the TAIL_CONDITIONED output of ``UniformNegativeSampler``.

    Every minibatch must unpack into ``(pos_src, pos_dst, neg_dst)``. Each of
    the three tensors has ``batch_size`` entries along its first dimension,
    and ``neg_dst`` holds ``batch_size * negative_ratio`` elements in total.
    """
    num_seeds = 30
    batch_size = 10

    # Graph the negative sampler draws from.
    graph = gb_test_utils.rand_csc_graph(100, 0.05)

    # Seed pairs (i, i + num_seeds) for i in [0, num_seeds).
    seed_src = torch.arange(0, num_seeds)
    seed_dst = torch.arange(num_seeds, num_seeds * 2)
    item_set = gb.ItemSet((seed_src, seed_dst))
    minibatch_sampler = gb.MinibatchSampler(item_set, batch_size=batch_size)

    negative_sampler = gb.UniformNegativeSampler(
        minibatch_sampler,
        negative_ratio,
        gb.LinkPredictionEdgeFormat.TAIL_CONDITIONED,
        graph,
    )

    # Draw negatives batch by batch and verify each triple.
    for pos_src, pos_dst, neg_dst in negative_sampler:
        # Assertions: one row per positive edge in every returned tensor.
        for tensor in (pos_src, pos_dst, neg_dst):
            assert len(tensor) == batch_size
        assert neg_dst.numel() == batch_size * negative_ratio
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment