"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "39a6a2400a6d7f0869beb7cd3ce9b729a6fb3a84"
Unverified commit cad453f2, authored by Yan Ru Pei, committed by GitHub
Browse files

chore: applying rolling hasher in prefix synthesizer (#5903)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent a337113a
...@@ -27,6 +27,7 @@ from prefix_data_generator.graph_utils import ( ...@@ -27,6 +27,7 @@ from prefix_data_generator.graph_utils import (
_remove_leaves, _remove_leaves,
_verify_tree, _verify_tree,
) )
from prefix_data_generator.hasher import RollingHasher
from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
from prefix_data_generator.sampler import EmpiricalSampler, sample_from_cdf from prefix_data_generator.sampler import EmpiricalSampler, sample_from_cdf
...@@ -103,11 +104,15 @@ class Synthesizer: ...@@ -103,11 +104,15 @@ class Synthesizer:
output_lens = [] output_lens = []
for line in f: for line in f:
data = json.loads(line) data = json.loads(line)
hash_ids_list.append(np.array(data["hash_ids"])) hash_ids_list.append(data["hash_ids"])
timestamps.append(int(data["timestamp"])) timestamps.append(int(data["timestamp"]))
input_lens.append(np.array(data["input_length"])) input_lens.append(int(data["input_length"]))
output_lens.append(int(data["output_length"])) output_lens.append(int(data["output_length"]))
# Normalize hash_ids to consecutive integers starting from 0
hasher = RollingHasher()
hash_ids_list = [hasher([(h,) for h in hash_ids]) for hash_ids in hash_ids_list]
# represent prefix-tree as directed graph # represent prefix-tree as directed graph
self.G = nx.DiGraph() self.G = nx.DiGraph()
max_hash_id = SUPER_ROOT max_hash_id = SUPER_ROOT
......
...@@ -96,5 +96,32 @@ def test_graph_structure(): ...@@ -96,5 +96,32 @@ def test_graph_structure():
os.unlink(tmp.name) os.unlink(tmp.name)
def test_synthesize_requests_normalizes_hash_ids():
    """Test that synthesize_requests normalizes hash_ids to consecutive integers.

    Writes two identical trace records whose hash_ids are [5, 6] to a
    temporary JSONL file, builds a Synthesizer from that file, and checks
    that every synthesized request reports hash_ids [0, 1].
    """
    block_size = 64

    # Create input with non-consecutive hash_ids [5, 6]
    record = {
        "timestamp": 1000,
        "hash_ids": [5, 6],
        "input_length": block_size * 2,
        "output_length": 100,
    }
    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as tmp:
        for _ in range(2):
            json.dump(record, tmp)
            tmp.write("\n")

    # The file is closed (and therefore flushed) here, before it is read back.
    try:
        synthesizer = Synthesizer(tmp.name, block_size=block_size)
        requests = synthesizer.synthesize_requests(num_requests=2)

        assert len(requests) == 2
        # Both requests should have normalized hash_ids [0, 1]
        for req in requests:
            assert req["hash_ids"] == [0, 1], f"Expected [0, 1], got {req['hash_ids']}"
    finally:
        # delete=False means nothing else removes the file; clean up even
        # when an assertion above fails or the Synthesizer raises.
        os.unlink(tmp.name)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment