chore: applying rolling hasher in prefix synthesizer (#5903)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

chore: applying rolling hasher in prefix synthesizer (#5903)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
cad453f2 · Yan Ru Pei · GitHub · a337113a · cad453f2 · cad453f2
Unverified Commit cad453f2 authored Feb 02, 2026 by Yan Ru Pei Committed by GitHub Feb 02, 2026
2 changed files
--- a/benchmarks/prefix_data_generator/synthesizer.py
+++ b/benchmarks/prefix_data_generator/synthesizer.py
@@ -27,6 +27,7 @@ from prefix_data_generator.graph_utils import (
    _remove_leaves,
    _verify_tree,
 )
+from prefix_data_generator.hasher import RollingHasher
 from prefix_data_generator.protocols import CACHE_END, END_NODE, SUPER_ROOT
 from prefix_data_generator.sampler import EmpiricalSampler, sample_from_cdf
@@ -103,11 +104,15 @@ class Synthesizer:
            output_lens = []
            for line in f:
                data = json.loads(line)
-                hash_ids_list.append(np.array(data["hash_ids"]))
+                hash_ids_list.append(data["hash_ids"])
                timestamps.append(int(data["timestamp"]))
-                input_lens.append(np.array(data["input_length"]))
+                input_lens.append(int(data["input_length"]))
                output_lens.append(int(data["output_length"]))
+        # Normalize hash_ids to consecutive integers starting from 0
+        hasher = RollingHasher()
+        hash_ids_list = [hasher([(h,) for h in hash_ids]) for hash_ids in hash_ids_list]
        # represent prefix-tree as directed graph
        self.G = nx.DiGraph()
        max_hash_id = SUPER_ROOT

--- a/benchmarks/prefix_data_generator/tests/test_synthesizer.py
+++ b/benchmarks/prefix_data_generator/tests/test_synthesizer.py
@@ -96,5 +96,32 @@ def test_graph_structure():
    os.unlink(tmp.name)
+def test_synthesize_requests_normalizes_hash_ids():
+    """Test that synthesize_requests normalizes hash_ids to consecutive integers."""
+    block_size = 64
+    # Create input whose hash_ids [5, 6] do not start at 0 (i.e. are not yet normalized)
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as tmp:
+        for _ in range(2):
+            data = {
+                "timestamp": 1000,
+                "hash_ids": [5, 6],
+                "input_length": block_size * 2,
+                "output_length": 100,
+            }
+            json.dump(data, tmp)
+            tmp.write("\n")
+    synthesizer = Synthesizer(tmp.name, block_size=block_size)
+    requests = synthesizer.synthesize_requests(num_requests=2)
+    assert len(requests) == 2
+    # Both requests should have normalized hash_ids [0, 1]
+    for req in requests:
+        assert req["hash_ids"] == [0, 1], f"Expected [0, 1], got {req['hash_ids']}"
+    os.unlink(tmp.name)
 if __name__ == "__main__":
    unittest.main()