Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
cad453f2
Unverified
Commit
cad453f2
authored
Feb 02, 2026
by
Yan Ru Pei
Committed by
GitHub
Feb 02, 2026
Browse files
chore: applying rolling hasher in prefix synthesizer (#5903)
Signed-off-by:
PeaBrane
<
yanrpei@gmail.com
>
parent
a337113a
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
34 additions
and
2 deletions
+34
-2
benchmarks/prefix_data_generator/synthesizer.py
benchmarks/prefix_data_generator/synthesizer.py
+7
-2
benchmarks/prefix_data_generator/tests/test_synthesizer.py
benchmarks/prefix_data_generator/tests/test_synthesizer.py
+27
-0
No files found.
benchmarks/prefix_data_generator/synthesizer.py
View file @
cad453f2
...
@@ -27,6 +27,7 @@ from prefix_data_generator.graph_utils import (
...
@@ -27,6 +27,7 @@ from prefix_data_generator.graph_utils import (
_remove_leaves
,
_remove_leaves
,
_verify_tree
,
_verify_tree
,
)
)
from
prefix_data_generator.hasher
import
RollingHasher
from
prefix_data_generator.protocols
import
CACHE_END
,
END_NODE
,
SUPER_ROOT
from
prefix_data_generator.protocols
import
CACHE_END
,
END_NODE
,
SUPER_ROOT
from
prefix_data_generator.sampler
import
EmpiricalSampler
,
sample_from_cdf
from
prefix_data_generator.sampler
import
EmpiricalSampler
,
sample_from_cdf
...
@@ -103,11 +104,15 @@ class Synthesizer:
...
@@ -103,11 +104,15 @@ class Synthesizer:
output_lens
=
[]
output_lens
=
[]
for
line
in
f
:
for
line
in
f
:
data
=
json
.
loads
(
line
)
data
=
json
.
loads
(
line
)
hash_ids_list
.
append
(
np
.
array
(
data
[
"hash_ids"
])
)
hash_ids_list
.
append
(
data
[
"hash_ids"
])
timestamps
.
append
(
int
(
data
[
"timestamp"
]))
timestamps
.
append
(
int
(
data
[
"timestamp"
]))
input_lens
.
append
(
np
.
array
(
data
[
"input_length"
]))
input_lens
.
append
(
int
(
data
[
"input_length"
]))
output_lens
.
append
(
int
(
data
[
"output_length"
]))
output_lens
.
append
(
int
(
data
[
"output_length"
]))
# Normalize hash_ids to consecutive integers starting from 0
hasher
=
RollingHasher
()
hash_ids_list
=
[
hasher
([(
h
,)
for
h
in
hash_ids
])
for
hash_ids
in
hash_ids_list
]
# represent prefix-tree as directed graph
# represent prefix-tree as directed graph
self
.
G
=
nx
.
DiGraph
()
self
.
G
=
nx
.
DiGraph
()
max_hash_id
=
SUPER_ROOT
max_hash_id
=
SUPER_ROOT
...
...
benchmarks/prefix_data_generator/tests/test_synthesizer.py
View file @
cad453f2
...
@@ -96,5 +96,32 @@ def test_graph_structure():
...
@@ -96,5 +96,32 @@ def test_graph_structure():
os
.
unlink
(
tmp
.
name
)
os
.
unlink
(
tmp
.
name
)
def test_synthesize_requests_normalizes_hash_ids():
    """Test that synthesize_requests normalizes hash_ids to consecutive integers.

    Two identical trace records are written with hash_ids ``[5, 6]``; the
    synthesized requests are expected to carry the ids ``[0, 1]`` instead.
    """
    block_size = 64
    record = {
        "timestamp": 1000,
        "hash_ids": [5, 6],
        "input_length": block_size * 2,
        "output_length": 100,
    }

    # Create input whose hash_ids ([5, 6]) do not start at 0.
    # delete=False keeps the file on disk after the handle is closed so it can
    # be re-opened by path; the finally clause below is responsible for removal.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".jsonl", delete=False
    ) as tmp:
        for _ in range(2):
            json.dump(record, tmp)
            tmp.write("\n")

    try:
        # The handle is closed (and therefore flushed) before the file is read.
        synthesizer = Synthesizer(tmp.name, block_size=block_size)
        requests = synthesizer.synthesize_requests(num_requests=2)

        assert len(requests) == 2
        # Both requests should have normalized hash_ids [0, 1]
        for req in requests:
            assert req["hash_ids"] == [0, 1], f"Expected [0, 1], got {req['hash_ids']}"
    finally:
        # Remove the temp file even when an assertion above fails.
        os.unlink(tmp.name)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
unittest
.
main
()
unittest
.
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment