Unverified commit cad453f2, authored by Yan Ru Pei, committed by GitHub
Browse files

chore: applying rolling hasher in prefix synthesizer (#5903)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent a337113a
......@@ -27,6 +27,7 @@ from prefix_data_generator.graph_utils import (
_remove_leaves,
_verify_tree,
)
from prefix_data_generator.hasher import RollingHasher
from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
from prefix_data_generator.sampler import EmpiricalSampler, sample_from_cdf
......@@ -103,11 +104,15 @@ class Synthesizer:
output_lens = []
for line in f:
data = json.loads(line)
hash_ids_list.append(np.array(data["hash_ids"]))
hash_ids_list.append(data["hash_ids"])
timestamps.append(int(data["timestamp"]))
input_lens.append(np.array(data["input_length"]))
input_lens.append(int(data["input_length"]))
output_lens.append(int(data["output_length"]))
# Normalize hash_ids to consecutive integers starting from 0
hasher = RollingHasher()
hash_ids_list = [hasher([(h,) for h in hash_ids]) for hash_ids in hash_ids_list]
# represent prefix-tree as directed graph
self.G = nx.DiGraph()
max_hash_id = SUPER_ROOT
......
......@@ -96,5 +96,32 @@ def test_graph_structure():
os.unlink(tmp.name)
def test_synthesize_requests_normalizes_hash_ids():
    """Check that synthesize_requests emits hash_ids renumbered from 0.

    Two identical trace records carrying the non-consecutive hash_ids
    ``[5, 6]`` are written to a temporary JSONL file. Every synthesized
    request is then expected to report hash_ids ``[0, 1]``.
    """
    block_size = 64
    # One record spanning exactly two blocks; it is written twice below.
    record = {
        "timestamp": 1000,
        "hash_ids": [5, 6],
        "input_length": block_size * 2,
        "output_length": 100,
    }

    # delete=False keeps the file on disk after the handle is closed so it
    # can be reopened by path; it is removed explicitly in the finally below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as tmp:
        for _ in range(2):
            json.dump(record, tmp)
            tmp.write("\n")

    try:
        synthesizer = Synthesizer(tmp.name, block_size=block_size)
        requests = synthesizer.synthesize_requests(num_requests=2)

        assert len(requests) == 2
        # Both requests should have normalized hash_ids [0, 1]
        for req in requests:
            assert req["hash_ids"] == [0, 1], f"Expected [0, 1], got {req['hash_ids']}"
    finally:
        # Clean up even when construction or an assertion above fails, so a
        # failing run does not leave stray temp files behind.
        os.unlink(tmp.name)
# Run the tests through unittest when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment