Unverified commit f849e1a6, authored by Yan Ru Pei, committed by GitHub
Browse files

fix(kv-router): migrate raw zmq paths to libzmq and refresh lockfiles (#7871)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent bfc59cd2
...@@ -14,7 +14,7 @@ version = "0.8.12" ...@@ -14,7 +14,7 @@ version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"getrandom 0.3.4", "getrandom 0.3.4",
"once_cell", "once_cell",
"serde", "serde",
...@@ -306,33 +306,6 @@ dependencies = [ ...@@ -306,33 +306,6 @@ dependencies = [
"syn 2.0.117", "syn 2.0.117",
] ]
[[package]]
name = "async_zmq"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "499c7104563d51146553fb0963f00210d8825833789e0ed270dd96aeeff6ac93"
dependencies = [
"futures",
"mio 0.6.23",
"once_cell",
"slab",
"thiserror 1.0.69",
"zmq",
]
[[package]]
name = "asynchronous-codec"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a860072022177f903e59730004fb5dc13db9275b79bb2aef7ba8ce831956c233"
dependencies = [
"bytes",
"futures-sink",
"futures-util",
"memchr",
"pin-project-lite",
]
[[package]] [[package]]
name = "atomic" name = "atomic"
version = "0.6.1" version = "0.6.1"
...@@ -356,7 +329,7 @@ checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" ...@@ -356,7 +329,7 @@ checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [ dependencies = [
"hermit-abi 0.1.19", "hermit-abi 0.1.19",
"libc", "libc",
"winapi 0.3.9", "winapi",
] ]
[[package]] [[package]]
...@@ -749,7 +722,7 @@ dependencies = [ ...@@ -749,7 +722,7 @@ dependencies = [
"arrayref", "arrayref",
"arrayvec", "arrayvec",
"cc", "cc",
"cfg-if 1.0.4", "cfg-if",
"constant_time_eq", "constant_time_eq",
"cpufeatures", "cpufeatures",
"memmap2", "memmap2",
...@@ -915,12 +888,6 @@ dependencies = [ ...@@ -915,12 +888,6 @@ dependencies = [
"target-lexicon", "target-lexicon",
] ]
[[package]]
name = "cfg-if"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.4" version = "1.0.4"
...@@ -1063,7 +1030,7 @@ version = "3.1.1" ...@@ -1063,7 +1030,7 @@ version = "3.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34"
dependencies = [ dependencies = [
"windows-sys 0.48.0", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
...@@ -1073,7 +1040,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -1073,7 +1040,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a" checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a"
dependencies = [ dependencies = [
"castaway", "castaway",
"cfg-if 1.0.4", "cfg-if",
"itoa", "itoa",
"rustversion", "rustversion",
"ryu", "ryu",
...@@ -1265,7 +1232,7 @@ version = "1.5.0" ...@@ -1265,7 +1232,7 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
] ]
[[package]] [[package]]
...@@ -1461,7 +1428,7 @@ version = "4.1.3" ...@@ -1461,7 +1428,7 @@ version = "4.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"cpufeatures", "cpufeatures",
"curve25519-dalek-derive", "curve25519-dalek-derive",
"digest", "digest",
...@@ -1606,7 +1573,7 @@ version = "5.5.3" ...@@ -1606,7 +1573,7 @@ version = "5.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"hashbrown 0.14.5", "hashbrown 0.14.5",
"lock_api", "lock_api",
"once_cell", "once_cell",
...@@ -1619,7 +1586,7 @@ version = "6.1.0" ...@@ -1619,7 +1586,7 @@ version = "6.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"crossbeam-utils", "crossbeam-utils",
"hashbrown 0.14.5", "hashbrown 0.14.5",
"lock_api", "lock_api",
...@@ -1897,7 +1864,6 @@ dependencies = [ ...@@ -1897,7 +1864,6 @@ dependencies = [
"anyhow", "anyhow",
"async-trait", "async-trait",
"axum 0.8.4", "axum 0.8.4",
"bytes",
"dashmap 6.1.0", "dashmap 6.1.0",
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
...@@ -1922,7 +1888,7 @@ dependencies = [ ...@@ -1922,7 +1888,7 @@ dependencies = [
"uuid", "uuid",
"validator", "validator",
"xxhash-rust", "xxhash-rust",
"zeromq", "zmq",
] ]
[[package]] [[package]]
...@@ -2023,7 +1989,6 @@ dependencies = [ ...@@ -2023,7 +1989,6 @@ dependencies = [
"validator", "validator",
"video-rs", "video-rs",
"xxhash-rust", "xxhash-rust",
"zeromq",
] ]
[[package]] [[package]]
...@@ -2129,7 +2094,6 @@ dependencies = [ ...@@ -2129,7 +2094,6 @@ dependencies = [
"async-once-cell", "async-once-cell",
"async-stream", "async-stream",
"async-trait", "async-trait",
"async_zmq",
"axum 0.8.4", "axum 0.8.4",
"bincode 1.3.3", "bincode 1.3.3",
"blake3", "blake3",
...@@ -2179,6 +2143,7 @@ dependencies = [ ...@@ -2179,6 +2143,7 @@ dependencies = [
"temp-env", "temp-env",
"tempfile", "tempfile",
"thiserror 2.0.18", "thiserror 2.0.18",
"tmq",
"tokio", "tokio",
"tokio-rayon", "tokio-rayon",
"tokio-stream", "tokio-stream",
...@@ -2191,7 +2156,6 @@ dependencies = [ ...@@ -2191,7 +2156,6 @@ dependencies = [
"uuid", "uuid",
"validator", "validator",
"xxhash-rust", "xxhash-rust",
"zmq",
] ]
[[package]] [[package]]
...@@ -2263,7 +2227,7 @@ version = "0.8.35" ...@@ -2263,7 +2227,7 @@ version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
] ]
[[package]] [[package]]
...@@ -2550,7 +2514,7 @@ version = "0.2.27" ...@@ -2550,7 +2514,7 @@ version = "0.2.27"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"libc", "libc",
"libredox", "libredox",
] ]
...@@ -2652,22 +2616,6 @@ dependencies = [ ...@@ -2652,22 +2616,6 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "fuchsia-zircon"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
dependencies = [
"bitflags 1.3.2",
"fuchsia-zircon-sys",
]
[[package]]
name = "fuchsia-zircon-sys"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
[[package]] [[package]]
name = "futures" name = "futures"
version = "0.3.32" version = "0.3.32"
...@@ -2797,7 +2745,7 @@ version = "0.2.17" ...@@ -2797,7 +2745,7 @@ version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"js-sys", "js-sys",
"libc", "libc",
"wasi", "wasi",
...@@ -2810,7 +2758,7 @@ version = "0.3.4" ...@@ -2810,7 +2758,7 @@ version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"js-sys", "js-sys",
"libc", "libc",
"r-efi 5.3.0", "r-efi 5.3.0",
...@@ -2824,7 +2772,7 @@ version = "0.4.2" ...@@ -2824,7 +2772,7 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"libc", "libc",
"r-efi 6.0.0", "r-efi 6.0.0",
"wasip2", "wasip2",
...@@ -2934,7 +2882,7 @@ version = "2.7.1" ...@@ -2934,7 +2882,7 @@ version = "2.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"crunchy", "crunchy",
"num-traits", "num-traits",
"rand 0.9.2", "rand 0.9.2",
...@@ -3072,7 +3020,7 @@ version = "0.4.2" ...@@ -3072,7 +3020,7 @@ version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd" checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"libc", "libc",
"windows-link", "windows-link",
] ]
...@@ -3535,7 +3483,7 @@ version = "0.1.13" ...@@ -3535,7 +3483,7 @@ version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
] ]
[[package]] [[package]]
...@@ -3549,15 +3497,6 @@ dependencies = [ ...@@ -3549,15 +3497,6 @@ dependencies = [
"syn 2.0.117", "syn 2.0.117",
] ]
[[package]]
name = "iovec"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "ipnet" name = "ipnet"
version = "2.12.0" version = "2.12.0"
...@@ -3823,16 +3762,6 @@ dependencies = [ ...@@ -3823,16 +3762,6 @@ dependencies = [
"serde_json", "serde_json",
] ]
[[package]]
name = "kernel32-sys"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
dependencies = [
"winapi 0.2.8",
"winapi-build",
]
[[package]] [[package]]
name = "kqueue" name = "kqueue"
version = "1.1.1" version = "1.1.1"
...@@ -4102,7 +4031,7 @@ version = "0.8.9" ...@@ -4102,7 +4031,7 @@ version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"windows-link", "windows-link",
] ]
...@@ -4112,7 +4041,7 @@ version = "0.9.0" ...@@ -4112,7 +4041,7 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"windows-link", "windows-link",
] ]
...@@ -4322,7 +4251,7 @@ version = "0.1.1" ...@@ -4322,7 +4251,7 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"rayon", "rayon",
] ]
...@@ -4332,7 +4261,7 @@ version = "0.10.6" ...@@ -4332,7 +4261,7 @@ version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"digest", "digest",
] ]
...@@ -4438,25 +4367,6 @@ dependencies = [ ...@@ -4438,25 +4367,6 @@ dependencies = [
"web-time", "web-time",
] ]
[[package]]
name = "mio"
version = "0.6.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4"
dependencies = [
"cfg-if 0.1.10",
"fuchsia-zircon",
"fuchsia-zircon-sys",
"iovec",
"kernel32-sys",
"libc",
"log",
"miow",
"net2",
"slab",
"winapi 0.2.8",
]
[[package]] [[package]]
name = "mio" name = "mio"
version = "0.8.11" version = "0.8.11"
...@@ -4480,18 +4390,6 @@ dependencies = [ ...@@ -4480,18 +4390,6 @@ dependencies = [
"windows-sys 0.61.2", "windows-sys 0.61.2",
] ]
[[package]]
name = "miow"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d"
dependencies = [
"kernel32-sys",
"net2",
"winapi 0.2.8",
"ws2_32-sys",
]
[[package]] [[package]]
name = "mockito" name = "mockito"
version = "1.7.2" version = "1.7.2"
...@@ -4685,17 +4583,6 @@ dependencies = [ ...@@ -4685,17 +4583,6 @@ dependencies = [
"syn 2.0.117", "syn 2.0.117",
] ]
[[package]]
name = "net2"
version = "0.2.39"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b13b648036a2339d06de780866fbdfda0dde886de7b3af2ddeba8b14f4ee34ac"
dependencies = [
"cfg-if 0.1.10",
"libc",
"winapi 0.3.9",
]
[[package]] [[package]]
name = "new_debug_unreachable" name = "new_debug_unreachable"
version = "1.0.6" version = "1.0.6"
...@@ -4709,7 +4596,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -4709,7 +4596,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b"
dependencies = [ dependencies = [
"bitflags 1.3.2", "bitflags 1.3.2",
"cfg-if 1.0.4", "cfg-if",
"libc", "libc",
"memoffset", "memoffset",
"pin-utils", "pin-utils",
...@@ -4722,7 +4609,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -4722,7 +4609,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
dependencies = [ dependencies = [
"bitflags 2.11.0", "bitflags 2.11.0",
"cfg-if 1.0.4", "cfg-if",
"cfg_aliases", "cfg_aliases",
"libc", "libc",
] ]
...@@ -5398,7 +5285,7 @@ version = "0.9.12" ...@@ -5398,7 +5285,7 @@ version = "0.9.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"libc", "libc",
"redox_syscall 0.5.18", "redox_syscall 0.5.18",
"smallvec", "smallvec",
...@@ -5817,7 +5704,7 @@ version = "0.14.0" ...@@ -5817,7 +5704,7 @@ version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ca5326d8d0b950a9acd87e6a3f94745394f62e4dae1b1ee22b2bc0c394af43a" checksum = "3ca5326d8d0b950a9acd87e6a3f94745394f62e4dae1b1ee22b2bc0c394af43a"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"fnv", "fnv",
"lazy_static", "lazy_static",
"memchr", "memchr",
...@@ -6208,7 +6095,7 @@ dependencies = [ ...@@ -6208,7 +6095,7 @@ dependencies = [
"av1-grain", "av1-grain",
"bitstream-io", "bitstream-io",
"built", "built",
"cfg-if 1.0.4", "cfg-if",
"interpolate_name", "interpolate_name",
"itertools 0.14.0", "itertools 0.14.0",
"libc", "libc",
...@@ -6477,7 +6364,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -6477,7 +6364,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
dependencies = [ dependencies = [
"cc", "cc",
"cfg-if 1.0.4", "cfg-if",
"getrandom 0.2.17", "getrandom 0.2.17",
"libc", "libc",
"untrusted", "untrusted",
...@@ -6570,7 +6457,7 @@ version = "0.18.2" ...@@ -6570,7 +6457,7 @@ version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d428f8247852f894ee1be110b375111b586d4fa431f6c46e64ba5a0dcccbe605" checksum = "d428f8247852f894ee1be110b375111b586d4fa431f6c46e64ba5a0dcccbe605"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"glob", "glob",
"proc-macro2", "proc-macro2",
"quote", "quote",
...@@ -6587,7 +6474,7 @@ version = "0.23.0" ...@@ -6587,7 +6474,7 @@ version = "0.23.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "825ea780781b15345a146be27eaefb05085e337e869bff01b4306a4fd4a9ad5a" checksum = "825ea780781b15345a146be27eaefb05085e337e869bff01b4306a4fd4a9ad5a"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"glob", "glob",
"proc-macro-crate", "proc-macro-crate",
"proc-macro2", "proc-macro2",
...@@ -6605,7 +6492,7 @@ version = "0.25.0" ...@@ -6605,7 +6492,7 @@ version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f168d99749d307be9de54d23fd226628d99768225ef08f6ffb52e0182a27746" checksum = "1f168d99749d307be9de54d23fd226628d99768225ef08f6ffb52e0182a27746"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"glob", "glob",
"proc-macro-crate", "proc-macro-crate",
"proc-macro2", "proc-macro2",
...@@ -6623,7 +6510,7 @@ version = "0.26.1" ...@@ -6623,7 +6510,7 @@ version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c845311f0ff7951c5506121a9ad75aec44d083c31583b2ea5a30bcb0b0abba0" checksum = "9c845311f0ff7951c5506121a9ad75aec44d083c31583b2ea5a30bcb0b0abba0"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"glob", "glob",
"proc-macro-crate", "proc-macro-crate",
"proc-macro2", "proc-macro2",
...@@ -6686,7 +6573,7 @@ version = "0.21.3" ...@@ -6686,7 +6573,7 @@ version = "0.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "796e8d2b6696392a43bea58116b667fb4c29727dc5abd27d6acf338bb4f688c7" checksum = "796e8d2b6696392a43bea58116b667fb4c29727dc5abd27d6acf338bb4f688c7"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"ordered-multimap", "ordered-multimap",
] ]
...@@ -7268,7 +7155,7 @@ version = "0.10.6" ...@@ -7268,7 +7155,7 @@ version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"cpufeatures", "cpufeatures",
"digest", "digest",
] ]
...@@ -7279,7 +7166,7 @@ version = "0.10.9" ...@@ -7279,7 +7166,7 @@ version = "0.10.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"cpufeatures", "cpufeatures",
"digest", "digest",
] ]
...@@ -7413,7 +7300,7 @@ checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" ...@@ -7413,7 +7300,7 @@ checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b"
dependencies = [ dependencies = [
"byteorder", "byteorder",
"libc", "libc",
"winapi 0.3.9", "winapi",
] ]
[[package]] [[package]]
...@@ -7466,7 +7353,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -7466,7 +7353,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cffa8a2e517b4e9f270c47e1c4120df90506d9451c1efa67e3698d66446d30ce" checksum = "cffa8a2e517b4e9f270c47e1c4120df90506d9451c1efa67e3698d66446d30ce"
dependencies = [ dependencies = [
"libc", "libc",
"winapi 0.3.9", "winapi",
] ]
[[package]] [[package]]
...@@ -7689,7 +7576,7 @@ version = "1.1.9" ...@@ -7689,7 +7576,7 @@ version = "1.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
] ]
[[package]] [[package]]
...@@ -7959,7 +7846,6 @@ checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" ...@@ -7959,7 +7846,6 @@ checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098"
dependencies = [ dependencies = [
"bytes", "bytes",
"futures-core", "futures-core",
"futures-io",
"futures-sink", "futures-sink",
"futures-util", "futures-util",
"pin-project-lite", "pin-project-lite",
...@@ -8919,7 +8805,7 @@ version = "0.2.114" ...@@ -8919,7 +8805,7 @@ version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"once_cell", "once_cell",
"rustversion", "rustversion",
"wasm-bindgen-macro", "wasm-bindgen-macro",
...@@ -8932,7 +8818,7 @@ version = "0.4.64" ...@@ -8932,7 +8818,7 @@ version = "0.4.64"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"futures-util", "futures-util",
"js-sys", "js-sys",
"once_cell", "once_cell",
...@@ -9063,12 +8949,6 @@ version = "0.1.12" ...@@ -9063,12 +8949,6 @@ version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88"
[[package]]
name = "winapi"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
[[package]] [[package]]
name = "winapi" name = "winapi"
version = "0.3.9" version = "0.3.9"
...@@ -9079,12 +8959,6 @@ dependencies = [ ...@@ -9079,12 +8959,6 @@ dependencies = [
"winapi-x86_64-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu",
] ]
[[package]]
name = "winapi-build"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"
[[package]] [[package]]
name = "winapi-i686-pc-windows-gnu" name = "winapi-i686-pc-windows-gnu"
version = "0.4.0" version = "0.4.0"
...@@ -9097,7 +8971,7 @@ version = "0.1.11" ...@@ -9097,7 +8971,7 @@ version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
dependencies = [ dependencies = [
"windows-sys 0.48.0", "windows-sys 0.61.2",
] ]
[[package]] [[package]]
...@@ -9422,7 +9296,7 @@ version = "0.50.0" ...@@ -9422,7 +9296,7 @@ version = "0.50.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if",
"windows-sys 0.48.0", "windows-sys 0.48.0",
] ]
...@@ -9526,16 +9400,6 @@ version = "0.6.2" ...@@ -9526,16 +9400,6 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "ws2_32-sys"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e"
dependencies = [
"winapi 0.2.8",
"winapi-build",
]
[[package]] [[package]]
name = "xxhash-rust" name = "xxhash-rust"
version = "0.8.15" version = "0.8.15"
...@@ -9635,33 +9499,6 @@ version = "1.8.2" ...@@ -9635,33 +9499,6 @@ version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
[[package]]
name = "zeromq"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a4528179201f6eecf211961a7d3276faa61554c82651ecc66387f68fc3004bd"
dependencies = [
"async-trait",
"asynchronous-codec",
"bytes",
"crossbeam-queue",
"dashmap 5.5.3",
"futures-channel",
"futures-io",
"futures-task",
"futures-util",
"log",
"num-traits",
"once_cell",
"parking_lot",
"rand 0.8.5",
"regex",
"thiserror 1.0.69",
"tokio",
"tokio-util",
"uuid",
]
[[package]] [[package]]
name = "zeromq-src" name = "zeromq-src"
version = "0.2.6+4.3.4" version = "0.2.6+4.3.4"
......
...@@ -64,7 +64,6 @@ anyhow = { version = "1" } ...@@ -64,7 +64,6 @@ anyhow = { version = "1" }
async-nats = { version = "0.45.0", features = ["service"] } async-nats = { version = "0.45.0", features = ["service"] }
async-stream = { version = "0.3" } async-stream = { version = "0.3" }
async-trait = { version = "0.1" } async-trait = { version = "0.1" }
async_zmq = { version = "0.4.0" }
blake3 = { version = "1" } blake3 = { version = "1" }
bytes = { version = "1" } bytes = { version = "1" }
chrono = { version = "0.4", default-features = false, features = [ chrono = { version = "0.4", default-features = false, features = [
...@@ -74,7 +73,7 @@ chrono = { version = "0.4", default-features = false, features = [ ...@@ -74,7 +73,7 @@ chrono = { version = "0.4", default-features = false, features = [
"now", "now",
"serde", "serde",
] } ] }
cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"] } cudarc = { version = "=0.19.3", features = ["cuda-version-from-build-system", "fallback-latest"] }
dashmap = { version = "6.1" } dashmap = { version = "6.1" }
derive_builder = { version = "0.20" } derive_builder = { version = "0.20" }
derive-getters = { version = "0.5" } derive-getters = { version = "0.5" }
...@@ -115,7 +114,6 @@ strum = { version = "0.27", features = ["derive"] } ...@@ -115,7 +114,6 @@ strum = { version = "0.27", features = ["derive"] }
tempfile = "3" tempfile = "3"
thiserror = { version = "2.0.17" } thiserror = { version = "2.0.17" }
tmq = { version = "0.5.0" } tmq = { version = "0.5.0" }
zmq = { version = "0.10" }
tokio = { version = "=1.48.0", features = ["full"] } tokio = { version = "=1.48.0", features = ["full"] }
tokio-stream = { version = "0.1" } tokio-stream = { version = "0.1" }
tokio-util = { version = "0.7.17", features = ["codec", "net", "rt", "io-util"] } tokio-util = { version = "0.7.17", features = ["codec", "net", "rt", "io-util"] }
......
This diff is collapsed.
...@@ -56,6 +56,6 @@ pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = ...@@ -56,6 +56,6 @@ pyo3-async-runtimes = { version = "0.23.0", default-features = false, features =
] } ] }
dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true } dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true }
cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"], optional = true } cudarc = { version = "=0.19.3", features = ["cuda-version-from-build-system", "fallback-latest"], optional = true }
[dev-dependencies] [dev-dependencies]
This diff is collapsed.
...@@ -17,7 +17,7 @@ default = [] ...@@ -17,7 +17,7 @@ default = []
metrics = ["dep:prometheus"] metrics = ["dep:prometheus"]
runtime-protocols = ["dep:dynamo-runtime"] runtime-protocols = ["dep:dynamo-runtime"]
bench = [] bench = []
standalone-indexer = ["dep:axum", "dep:bytes", "dep:zeromq", "dep:serde_json", "dep:reqwest"] standalone-indexer = ["dep:axum", "dep:serde_json", "dep:reqwest", "dep:zmq"]
indexer-runtime = ["metrics", "runtime-protocols", "standalone-indexer"] indexer-runtime = ["metrics", "runtime-protocols", "standalone-indexer"]
[dependencies] [dependencies]
...@@ -40,6 +40,7 @@ thiserror = { workspace = true } ...@@ -40,6 +40,7 @@ thiserror = { workspace = true }
tokio = { workspace = true } tokio = { workspace = true }
tokio-util = { workspace = true } tokio-util = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
zmq = { version = "0.10", optional = true }
uuid = { workspace = true } uuid = { workspace = true }
validator = { workspace = true } validator = { workspace = true }
xxhash-rust = { workspace = true } xxhash-rust = { workspace = true }
...@@ -53,9 +54,7 @@ rustc-hash = "2.1.1" ...@@ -53,9 +54,7 @@ rustc-hash = "2.1.1"
# standalone-indexer (optional) # standalone-indexer (optional)
axum = { workspace = true, optional = true } axum = { workspace = true, optional = true }
bytes = { workspace = true, optional = true }
reqwest = { workspace = true, optional = true } reqwest = { workspace = true, optional = true }
zeromq = { version = "0.4.1", optional = true }
[dev-dependencies] [dev-dependencies]
rstest = "0.18.2" rstest = "0.18.2"
......
...@@ -3,13 +3,10 @@ ...@@ -3,13 +3,10 @@
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::time::Duration;
use bytes::Bytes;
use rmp_serde as rmps; use rmp_serde as rmps;
use tokio::sync::watch; use tokio::sync::watch;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use zeromq::{DealerSocket, Socket, SocketRecv, SocketSend, SubSocket};
use crate::protocols::{WorkerId, WorkerWithDpRank}; use crate::protocols::{WorkerId, WorkerWithDpRank};
use crate::recovery::{CursorObservation, CursorState}; use crate::recovery::{CursorObservation, CursorState};
...@@ -17,18 +14,10 @@ use crate::zmq_wire::{KvEventBatch, convert_event}; ...@@ -17,18 +14,10 @@ use crate::zmq_wire::{KvEventBatch, convert_event};
use super::indexer::Indexer; use super::indexer::Indexer;
use super::registry::ListenerRecord; use super::registry::ListenerRecord;
use super::zmq::{
const INITIAL_BACKOFF_MS: u64 = 10; MultipartMessage, SharedSocket, connect_dealer_socket, connect_sub_socket, recv_multipart,
const MAX_BACKOFF_MS: u64 = 5000; send_multipart,
const MAX_CONSECUTIVE_ERRORS: u32 = 10; };
const MAX_BACKOFF_EXPONENT: u32 = 8;
fn calculate_backoff_ms(consecutive_errors: u32) -> u64 {
std::cmp::min(
INITIAL_BACKOFF_MS * 2_u64.pow(consecutive_errors.min(MAX_BACKOFF_EXPONENT)),
MAX_BACKOFF_MS,
)
}
const WATERMARK_UNSET: u64 = u64::MAX; const WATERMARK_UNSET: u64 = u64::MAX;
...@@ -46,11 +35,10 @@ struct ListenerLoop { ...@@ -46,11 +35,10 @@ struct ListenerLoop {
block_size: u32, block_size: u32,
indexer: Indexer, indexer: Indexer,
cancel: CancellationToken, cancel: CancellationToken,
socket: SubSocket, live_socket: SharedSocket,
replay_socket: Option<DealerSocket>, replay_socket: Option<SharedSocket>,
watermark: Arc<AtomicU64>, watermark: Arc<AtomicU64>,
warning_count: Arc<AtomicU32>, warning_count: Arc<AtomicU32>,
consecutive_errors: u32,
messages_processed: u64, messages_processed: u64,
} }
...@@ -62,8 +50,8 @@ impl ListenerLoop { ...@@ -62,8 +50,8 @@ impl ListenerLoop {
block_size: u32, block_size: u32,
indexer: Indexer, indexer: Indexer,
cancel: CancellationToken, cancel: CancellationToken,
socket: SubSocket, live_socket: SharedSocket,
replay_socket: Option<DealerSocket>, replay_socket: Option<SharedSocket>,
watermark: Arc<AtomicU64>, watermark: Arc<AtomicU64>,
) -> Self { ) -> Self {
Self { Self {
...@@ -72,11 +60,10 @@ impl ListenerLoop { ...@@ -72,11 +60,10 @@ impl ListenerLoop {
block_size, block_size,
indexer, indexer,
cancel, cancel,
socket, live_socket,
replay_socket, replay_socket,
watermark, watermark,
warning_count: Arc::new(AtomicU32::new(0)), warning_count: Arc::new(AtomicU32::new(0)),
consecutive_errors: 0,
messages_processed: 0, messages_processed: 0,
} }
} }
...@@ -94,7 +81,7 @@ impl ListenerLoop { ...@@ -94,7 +81,7 @@ impl ListenerLoop {
"Requesting replay from engine" "Requesting replay from engine"
); );
let Some(replay_socket) = self.replay_socket.as_mut() else { let Some(replay_socket) = self.replay_socket.as_ref() else {
tracing::warn!( tracing::warn!(
self.worker_id, self.worker_id,
self.dp_rank, self.dp_rank,
...@@ -111,21 +98,25 @@ impl ListenerLoop { ...@@ -111,21 +98,25 @@ impl ListenerLoop {
let warning_count = &self.warning_count; let warning_count = &self.warning_count;
let watermark = &self.watermark; let watermark = &self.watermark;
let req_frames = vec![Bytes::new(), Bytes::from(start_seq.to_be_bytes().to_vec())]; let req_frames = vec![Vec::new(), start_seq.to_be_bytes().to_vec()];
let Ok(req_msg) = zeromq::ZmqMessage::try_from(req_frames) else { if let Err(error) = send_multipart(replay_socket, req_frames).await {
tracing::error!(worker_id, dp_rank, "Failed to build replay request");
return 0;
};
if let Err(error) = replay_socket.send(req_msg).await {
tracing::error!(worker_id, dp_rank, error = %error, "Failed to send replay request"); tracing::error!(worker_id, dp_rank, error = %error, "Failed to send replay request");
return 0; return 0;
} }
let mut replayed = 0u64; let mut replayed = 0u64;
loop { loop {
let Ok(msg) = replay_socket.recv().await else { let msg = tokio::select! {
tracing::error!(worker_id, dp_rank, "Replay recv error"); _ = self.cancel.cancelled() => break,
break; result = recv_multipart(replay_socket) => {
match result {
Ok(msg) => msg,
Err(error) => {
tracing::error!(worker_id, dp_rank, error = %error, "Replay recv error");
break;
}
}
}
}; };
if msg.len() < 3 { if msg.len() < 3 {
tracing::warn!( tracing::warn!(
...@@ -251,7 +242,7 @@ impl ListenerLoop { ...@@ -251,7 +242,7 @@ impl ListenerLoop {
self.watermark.store(seq, Ordering::Release); self.watermark.store(seq, Ordering::Release);
} }
async fn handle_message(&mut self, msg: zeromq::ZmqMessage) { async fn handle_message(&mut self, msg: MultipartMessage) {
if msg.len() != 3 { if msg.len() != 3 {
tracing::warn!( tracing::warn!(
self.worker_id, self.worker_id,
...@@ -299,34 +290,15 @@ impl ListenerLoop { ...@@ -299,34 +290,15 @@ impl ListenerLoop {
return Ok(()); return Ok(());
} }
msg_result = self.socket.recv() => { result = recv_multipart(&self.live_socket) => {
match msg_result { match result {
Ok(msg) => { Ok(msg) => msg,
self.consecutive_errors = 0;
msg
}
Err(error) => { Err(error) => {
self.consecutive_errors += 1; return Err(format!(
"ZMQ recv failed for worker {} dp_rank {}: {error}",
if self.consecutive_errors >= MAX_CONSECUTIVE_ERRORS { self.worker_id,
return Err(format!( self.dp_rank,
"too many consecutive ZMQ recv errors for worker {} dp_rank {}: {error}", ));
self.worker_id,
self.dp_rank,
));
}
let backoff_ms = calculate_backoff_ms(self.consecutive_errors);
tracing::warn!(
error = %error,
consecutive_errors = self.consecutive_errors,
backoff_ms,
worker_id = self.worker_id,
dp_rank = self.dp_rank,
"ZMQ recv error, backing off"
);
tokio::time::sleep(Duration::from_millis(backoff_ms)).await;
continue;
} }
} }
} }
...@@ -382,18 +354,8 @@ async fn run_listener( ...@@ -382,18 +354,8 @@ async fn run_listener(
return Ok(()); return Ok(());
} }
let mut socket = SubSocket::new(); let socket = connect_sub_socket(&endpoint)
socket .map_err(|e| format!("failed to connect ZMQ SUB socket to {endpoint}: {e}"))?;
.subscribe("")
.await
.map_err(|e| format!("failed to subscribe on ZMQ socket: {e}"))?;
tokio::select! {
_ = cancel.cancelled() => return Ok(()),
result = socket.connect(&endpoint) => {
result.map_err(|e| format!("failed to connect ZMQ SUB socket to {endpoint}: {e}"))?;
}
}
tokio::select! { tokio::select! {
_ = cancel.cancelled() => return Ok(()), _ = cancel.cancelled() => return Ok(()),
...@@ -438,33 +400,31 @@ async fn connect_replay_socket( ...@@ -438,33 +400,31 @@ async fn connect_replay_socket(
dp_rank: u32, dp_rank: u32,
replay_endpoint: Option<&str>, replay_endpoint: Option<&str>,
cancel: &CancellationToken, cancel: &CancellationToken,
) -> Option<DealerSocket> { ) -> Option<SharedSocket> {
let endpoint = replay_endpoint?; let endpoint = replay_endpoint?;
let mut socket = DealerSocket::new(); if cancel.is_cancelled() {
tokio::select! { return None;
_ = cancel.cancelled() => None, }
result = socket.connect(endpoint) => {
match result { match connect_dealer_socket(endpoint) {
Ok(()) => { Ok(socket) => {
tracing::info!( tracing::info!(
worker_id, worker_id,
dp_rank, dp_rank,
replay_endpoint = endpoint, replay_endpoint = endpoint,
"Replay socket connected" "Replay socket connected"
); );
Some(socket) Some(socket)
} }
Err(e) => { Err(error) => {
tracing::error!( tracing::error!(
worker_id, worker_id,
dp_rank, dp_rank,
error = %e, error = %error,
"Failed to connect replay socket to {endpoint}" "Failed to connect replay socket to {endpoint}"
); );
None None
}
}
} }
} }
} }
...@@ -473,7 +433,9 @@ async fn connect_replay_socket( ...@@ -473,7 +433,9 @@ async fn connect_replay_socket(
mod tests { mod tests {
use super::{WATERMARK_UNSET, cursor_from_watermark}; use super::{WATERMARK_UNSET, cursor_from_watermark};
use crate::recovery::CursorObservation; use crate::recovery::CursorObservation;
use zeromq::{PubSocket, Socket, SocketRecv, SocketSend, SubSocket}; use crate::standalone_indexer::zmq::{
SharedSocket, bind_pub_socket, connect_sub_socket, recv_multipart, send_multipart,
};
#[test] #[test]
fn initial_gap_replays_from_zero_and_replayed_seq_becomes_stale() { fn initial_gap_replays_from_zero_and_replayed_seq_becomes_stale() {
...@@ -494,23 +456,27 @@ mod tests { ...@@ -494,23 +456,27 @@ mod tests {
#[tokio::test(flavor = "multi_thread", worker_threads = 2)] #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn zmq_buffers_messages_during_brief_delay() { async fn zmq_buffers_messages_during_brief_delay() {
let mut pub_socket = PubSocket::new(); let reserved_listener = reserve_open_port();
let bound_endpoint = pub_socket.bind("tcp://127.0.0.1:0").await.unwrap(); let endpoint = format!(
"tcp://127.0.0.1:{}",
let mut sub_socket = SubSocket::new(); reserved_listener
sub_socket.subscribe("").await.unwrap(); .local_addr()
sub_socket .expect("failed to read reserved listener address")
.connect(&bound_endpoint.to_string()) .port()
.await );
.unwrap(); drop(reserved_listener);
let pub_socket = bind_pub_socket(&endpoint).unwrap();
let mut sub_socket = connect_sub_socket(&endpoint).unwrap();
let (tx, mut rx) = tokio::sync::mpsc::channel::<SubSocket>(1); let (tx, mut rx) = tokio::sync::mpsc::channel::<SharedSocket>(1);
tokio::spawn(async move { tokio::spawn(async move {
let _ = sub_socket.recv().await.unwrap(); let _ = recv_multipart(&sub_socket).await.unwrap();
let _ = tx.send(sub_socket).await; let _ = tx.send(sub_socket).await;
}); });
loop { loop {
pub_socket.send("probe".into()).await.unwrap(); send_multipart(&pub_socket, vec![b"probe".to_vec()])
.await
.unwrap();
tokio::time::sleep(std::time::Duration::from_millis(50)).await; tokio::time::sleep(std::time::Duration::from_millis(50)).await;
if let Ok(sub) = rx.try_recv() { if let Ok(sub) = rx.try_recv() {
sub_socket = sub; sub_socket = sub;
...@@ -521,8 +487,7 @@ mod tests { ...@@ -521,8 +487,7 @@ mod tests {
let num_messages = 10u64; let num_messages = 10u64;
for i in 0..num_messages { for i in 0..num_messages {
pub_socket send_multipart(&pub_socket, vec![i.to_le_bytes().to_vec()])
.send(i.to_le_bytes().to_vec().into())
.await .await
.unwrap(); .unwrap();
} }
...@@ -530,14 +495,55 @@ mod tests { ...@@ -530,14 +495,55 @@ mod tests {
tokio::time::sleep(std::time::Duration::from_millis(500)).await; tokio::time::sleep(std::time::Duration::from_millis(500)).await;
for i in 0u64..num_messages { for i in 0u64..num_messages {
let msg = tokio::time::timeout(std::time::Duration::from_secs(5), sub_socket.recv()) let msg = tokio::time::timeout(
.await std::time::Duration::from_secs(5),
.expect("timed out waiting for ZMQ message") recv_multipart(&sub_socket),
.expect("ZMQ recv error"); )
.await
.expect("timed out waiting for ZMQ message")
.unwrap();
let payload = msg.get(0).unwrap(); let payload = msg.first().unwrap();
let received = u64::from_le_bytes(payload[..8].try_into().unwrap()); let received = u64::from_le_bytes(payload[..8].try_into().unwrap());
assert_eq!(received, i, "message {i} arrived out of order"); assert_eq!(received, i, "message {i} arrived out of order");
} }
} }
/// Checks that a SUB socket connected to an endpoint *before* any PUB socket is bound there
/// still receives a message once the publisher binds and starts sending.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
async fn zmq_subscriber_connects_before_publisher_bind() {
// Bind a throwaway TCP listener to learn a free port, then release it so ZMQ can bind it.
// NOTE(review): another process could grab the port between `drop` and `bind_pub_socket`.
let reserved_listener = reserve_open_port();
let endpoint = format!(
"tcp://127.0.0.1:{}",
reserved_listener
.local_addr()
.expect("failed to read reserved listener address")
.port()
);
drop(reserved_listener);
// Subscriber connects first; nothing is listening on the endpoint yet.
let sub_socket = connect_sub_socket(&endpoint).unwrap();
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
let pub_socket = bind_pub_socket(&endpoint).unwrap();
// Send several probes spaced 50 ms apart so at least one is published after the
// subscriber's connection to the late-bound publisher is established.
for _ in 0..5 {
send_multipart(&pub_socket, vec![b"probe".to_vec()])
.await
.unwrap();
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
}
// A single received probe is enough; the 5 s timeout bounds the wait.
let msg = tokio::time::timeout(
std::time::Duration::from_secs(5),
recv_multipart(&sub_socket),
)
.await
.expect("timed out waiting for ZMQ message")
.unwrap();
assert_eq!(msg, vec![b"probe".to_vec()]);
}
/// Binds a TCP listener on `127.0.0.1` with port `0` and returns it.
///
/// Callers read the assigned port from the listener's local address.
///
/// # Panics
/// Panics if the bind fails.
fn reserve_open_port() -> std::net::TcpListener {
    let bind_result = std::net::TcpListener::bind("127.0.0.1:0");
    bind_result.expect("failed to bind probe listener")
}
} }
...@@ -9,6 +9,7 @@ pub mod registry; ...@@ -9,6 +9,7 @@ pub mod registry;
#[cfg(feature = "indexer-runtime")] #[cfg(feature = "indexer-runtime")]
pub mod runtime; pub mod runtime;
pub mod server; pub mod server;
mod zmq;
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
...@@ -38,10 +39,39 @@ pub struct RuntimeConfig { ...@@ -38,10 +39,39 @@ pub struct RuntimeConfig {
} }
pub(super) fn validate_zmq_endpoint(endpoint: &str) -> anyhow::Result<()> { pub(super) fn validate_zmq_endpoint(endpoint: &str) -> anyhow::Result<()> {
endpoint let (scheme, address) = endpoint
.parse::<zeromq::Endpoint>() .split_once("://")
.map(|_| ()) .ok_or_else(|| anyhow::anyhow!("invalid ZMQ endpoint `{endpoint}`: missing scheme"))?;
.map_err(|error| anyhow::anyhow!("invalid ZMQ endpoint `{endpoint}`: {error}"))
if address.is_empty() {
anyhow::bail!("invalid ZMQ endpoint `{endpoint}`: missing address");
}
match scheme {
"tcp" => {
let (host, port) = address.rsplit_once(':').ok_or_else(|| {
anyhow::anyhow!("invalid ZMQ endpoint `{endpoint}`: missing TCP port")
})?;
if host.is_empty() {
anyhow::bail!("invalid ZMQ endpoint `{endpoint}`: missing TCP host");
}
if host.starts_with('[') {
if !host.ends_with(']') {
anyhow::bail!("invalid ZMQ endpoint `{endpoint}`: missing closing `]`");
}
} else if host.contains(':') {
anyhow::bail!("invalid ZMQ endpoint `{endpoint}`: missing TCP port");
}
port.parse::<u16>().map_err(|error| {
anyhow::anyhow!("invalid ZMQ endpoint `{endpoint}`: invalid TCP port: {error}")
})?;
Ok(())
}
"ipc" | "inproc" => Ok(()),
other => Err(anyhow::anyhow!(
"invalid ZMQ endpoint `{endpoint}`: unsupported scheme `{other}`"
)),
}
} }
pub(super) fn validate_listener_endpoints( pub(super) fn validate_listener_endpoints(
...@@ -326,4 +356,20 @@ mod tests { ...@@ -326,4 +356,20 @@ mod tests {
let error = parse_workers("1").unwrap_err().to_string(); let error = parse_workers("1").unwrap_err().to_string();
assert!(error.contains("invalid worker entry")); assert!(error.contains("invalid worker entry"));
} }
/// Well-formed endpoints must validate: a wildcard-host TCP bind, a TCP address with
/// port 0, an `inproc` name, and an `ipc` path.
#[test]
fn test_validate_zmq_endpoint_allows_wildcard_tcp_bind() {
validate_zmq_endpoint("tcp://*:5558").unwrap();
validate_zmq_endpoint("tcp://127.0.0.1:0").unwrap();
validate_zmq_endpoint("inproc://listener").unwrap();
validate_zmq_endpoint("ipc:///tmp/dynamo.sock").unwrap();
}
/// Malformed endpoints must be rejected: TCP without a port, TCP without a host,
/// an unsupported scheme, and a string with no `://` separator.
#[test]
fn test_validate_zmq_endpoint_rejects_invalid_values() {
assert!(validate_zmq_endpoint("tcp://host").is_err());
assert!(validate_zmq_endpoint("tcp://:5558").is_err());
assert!(validate_zmq_endpoint("udp://host:5558").is_err());
assert!(validate_zmq_endpoint("not-an-endpoint").is_err());
}
} }
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::collections::VecDeque;
use std::future::poll_fn;
use std::os::unix::io::{AsRawFd, RawFd};
use std::sync::Arc;
use std::task::{Context, Poll, ready};
use anyhow::{Result, anyhow};
use tokio::{io::unix::AsyncFd, sync::Mutex};
/// A multipart ZMQ message: one byte vector per frame, in the order the frames are
/// received or are to be sent.
pub(super) type MultipartMessage = Vec<Vec<u8>>;
/// A [`ZmqSocket`] behind a Tokio async mutex, shared through an `Arc`.
pub(super) type SharedSocket = Arc<Mutex<ZmqSocket>>;
// Socket option values applied by the `configure_*` helpers in this file.
// The `_MS` suffix marks values passed to libzmq options that are expressed in milliseconds.
// Passed to `set_rcvtimeo` on receiving sockets.
const ZMQ_RCVTIMEOUT_MS: i32 = 100;
// Passed to `set_sndtimeo` on sending sockets.
const ZMQ_SNDTIMEOUT_MS: i32 = 0;
// Passed to `set_reconnect_ivl` / `set_reconnect_ivl_max` respectively.
const ZMQ_RECONNECT_IVL_MS: i32 = 100;
const ZMQ_RECONNECT_IVL_MAX_MS: i32 = 5000;
// Passed to `set_tcp_keepalive`; per libzmq's ZMQ_TCP_KEEPALIVE, 1 turns keepalive on.
const ZMQ_TCP_KEEPALIVE: i32 = 1;
// Passed to `set_heartbeat_ivl`, `set_heartbeat_timeout` and `set_heartbeat_ttl`.
const ZMQ_HEARTBEAT_IVL_MS: i32 = 5000;
const ZMQ_HEARTBEAT_TIMEOUT_MS: i32 = 15000;
const ZMQ_HEARTBEAT_TTL_MS: i32 = 15000;
// Passed to `set_linger`; per libzmq's ZMQ_LINGER, 0 discards unsent messages on close.
const ZMQ_LINGER_MS: i32 = 0;
/// Pairs a `zmq::Socket` with the raw file descriptor read from it at construction,
/// so the pair can expose that descriptor through `AsRawFd`.
struct SocketWrapper {
// The owned ZMQ socket.
socket: zmq::Socket,
// Descriptor returned by `socket.get_fd()` when the wrapper was built.
fd: RawFd,
}
impl SocketWrapper {
    /// Takes ownership of `socket` and caches the descriptor reported by `get_fd`.
    ///
    /// # Errors
    /// Propagates the error from `get_fd` if the descriptor cannot be queried.
    fn new(socket: zmq::Socket) -> Result<Self> {
        // Query the descriptor while the socket is still borrowed, before moving it in.
        let fd = socket.get_fd()?;
        Ok(Self { socket, fd })
    }
}
/// Exposes the cached descriptor so `SocketWrapper` can be wrapped in `AsyncFd`.
impl AsRawFd for SocketWrapper {
// Returns the descriptor captured in `SocketWrapper::new`; the socket is not queried again.
fn as_raw_fd(&self) -> RawFd {
self.fd
}
}
/// A ZMQ socket whose descriptor is registered with Tokio through `AsyncFd`, driven by the
/// `poll_*` methods implemented on this type.
pub(super) struct ZmqSocket(AsyncFd<SocketWrapper>);
impl ZmqSocket {
/// Wraps `socket` and registers its descriptor with `AsyncFd`.
///
/// Fails if the descriptor cannot be read from the socket or if `AsyncFd::new` fails.
fn new(socket: zmq::Socket) -> Result<Self> {
Ok(Self(AsyncFd::new(SocketWrapper::new(socket)?)?))
}
/// Borrows the wrapped `zmq::Socket`.
fn socket(&self) -> &zmq::Socket {
&self.0.get_ref().socket
}
/// Resolves with `Ok(())` once the socket's event mask (`get_events`) contains `event`.
///
/// When the event is not set, falls back to `clear_read_ready` and returns `Pending`.
/// Errors from `get_events` or from the readiness poll are returned as `Ready(Err(..))`.
fn poll_socket_event(
&mut self,
cx: &mut Context<'_>,
event: zmq::PollEvents,
) -> Poll<Result<()>> {
// The ZMQ-level event mask is the source of truth; descriptor readiness is only a wake-up hint.
if self.socket().get_events()?.contains(event) {
Poll::Ready(Ok(()))
} else {
self.clear_read_ready(cx)?;
Poll::Pending
}
}
/// Polls the descriptor for read readiness using `cx`.
///
/// If readiness is currently flagged, it is cleared and the task is woken right away so
/// that the next poll re-checks the socket. If it is not flagged, nothing else is done here
/// and the wake-up is left to `poll_read_ready`.
fn clear_read_ready(&mut self, cx: &mut Context<'_>) -> Result<()> {
if let Poll::Ready(mut guard) = self.0.poll_read_ready(cx)? {
guard.clear_ready();
// Self-wake: the caller is about to return `Pending` and must be polled again.
cx.waker().wake_by_ref();
}
Ok(())
}
/// Receives one complete multipart message without blocking.
///
/// Waits for `POLLIN`, then reads frames with `DONTWAIT` until a frame reports no more
/// parts. Returns `Pending` if no frame is available yet, and an error if the socket
/// reports `EAGAIN` after some frames of a message were already read.
fn poll_recv_multipart(&mut self, cx: &mut Context<'_>) -> Poll<Result<MultipartMessage>> {
ready!(self.poll_socket_event(cx, zmq::POLLIN))?;
let mut frames = Vec::new();
loop {
let mut msg = zmq::Message::new();
match self.socket().recv(&mut msg, zmq::DONTWAIT) {
Ok(_) => {
// Read the "more" flag before copying the frame out.
let more = msg.get_more();
frames.push(msg.to_vec());
if !more {
return Poll::Ready(Ok(frames));
}
}
// Nothing was read at all: go back to waiting for readiness.
Err(zmq::Error::EAGAIN) if frames.is_empty() => {
self.clear_read_ready(cx)?;
return Poll::Pending;
}
// A partially read message cannot be resumed here, so surface it as an error.
Err(zmq::Error::EAGAIN) => {
return Poll::Ready(Err(anyhow!(
"multipart receive interrupted after {} frames",
frames.len()
)));
}
Err(error) => return Poll::Ready(Err(error.into())),
}
}
}
/// Sends the frames in `buffer` as one multipart message without blocking.
///
/// Frames are removed from the front of `buffer` as they are sent, so a later poll resumes
/// with whatever is left. Returns `Ready(Ok(()))` once `buffer` is empty (including when it
/// was empty on entry), `Pending` when the socket is not writable, and `Ready(Err(..))` on
/// any other send error.
fn poll_send_multipart(
&mut self,
cx: &mut Context<'_>,
buffer: &mut VecDeque<zmq::Message>,
) -> Poll<Result<()>> {
while !buffer.is_empty() {
ready!(self.poll_socket_event(cx, zmq::POLLOUT))?;
while let Some(frame) = buffer.pop_front() {
let mut flags = zmq::DONTWAIT;
// Every frame except the last one is flagged as having more parts to follow.
if !buffer.is_empty() {
flags |= zmq::SNDMORE;
}
match self.socket().send(&*frame, flags) {
Ok(_) => {}
Err(zmq::Error::EAGAIN) => {
// Put the unsent frame back so it is retried first on the next poll.
buffer.push_front(frame);
self.clear_read_ready(cx)?;
return Poll::Pending;
}
Err(error) => return Poll::Ready(Err(error.into())),
}
}
}
Poll::Ready(Ok(()))
}
}
/// Applies the options shared by every socket built in this file: linger, reconnect
/// interval (initial and maximum), TCP keepalive, and the three heartbeat settings.
///
/// Stops at, and returns, the first option that fails to apply.
fn configure_common_socket(socket: &zmq::Socket) -> Result<()> {
socket.set_linger(ZMQ_LINGER_MS)?;
socket.set_reconnect_ivl(ZMQ_RECONNECT_IVL_MS)?;
socket.set_reconnect_ivl_max(ZMQ_RECONNECT_IVL_MAX_MS)?;
socket.set_tcp_keepalive(ZMQ_TCP_KEEPALIVE)?;
socket.set_heartbeat_ivl(ZMQ_HEARTBEAT_IVL_MS)?;
socket.set_heartbeat_timeout(ZMQ_HEARTBEAT_TIMEOUT_MS)?;
socket.set_heartbeat_ttl(ZMQ_HEARTBEAT_TTL_MS)?;
Ok(())
}
/// Applies the common options, then the receive timeout (`ZMQ_RCVTIMEOUT_MS`).
///
/// # Errors
/// Returns the first option-setting failure.
fn configure_receive_socket(socket: &zmq::Socket) -> Result<()> {
    configure_common_socket(socket)?;
    socket.set_rcvtimeo(ZMQ_RCVTIMEOUT_MS).map_err(Into::into)
}
/// Applies the receive-side configuration, then the send timeout (`ZMQ_SNDTIMEOUT_MS`),
/// for sockets that both send and receive.
///
/// # Errors
/// Returns the first option-setting failure.
fn configure_bidirectional_socket(socket: &zmq::Socket) -> Result<()> {
    configure_receive_socket(socket)?;
    socket.set_sndtimeo(ZMQ_SNDTIMEOUT_MS).map_err(Into::into)
}
/// Test-only: applies the common options, then the send timeout (`ZMQ_SNDTIMEOUT_MS`).
///
/// # Errors
/// Returns the first option-setting failure.
#[cfg(test)]
fn configure_send_socket(socket: &zmq::Socket) -> Result<()> {
    configure_common_socket(socket)?;
    socket.set_sndtimeo(ZMQ_SNDTIMEOUT_MS).map_err(Into::into)
}
/// Creates a socket of `socket_type`, runs `configure` on it, and wraps it in a [`ZmqSocket`].
///
/// `configure` runs before the socket is registered with `AsyncFd`, so it is the place to
/// set options and to connect or bind.
///
/// NOTE(review): every call creates its own `zmq::Context`. Presumably that means `inproc://`
/// endpoints cannot pair two sockets built here — confirm against libzmq before relying on it.
///
/// # Errors
/// Returns the error from socket creation, from `configure`, or from `ZmqSocket::new`.
fn build_socket<F>(socket_type: zmq::SocketType, configure: F) -> Result<ZmqSocket>
where
    F: FnOnce(&zmq::Socket) -> Result<()>,
{
    let ctx = zmq::Context::new();
    let raw = ctx.socket(socket_type)?;
    configure(&raw).and_then(|()| ZmqSocket::new(raw))
}
/// Builds a SUB socket with receive-side options, subscribes to every topic (empty prefix),
/// and connects it to `endpoint`.
///
/// # Errors
/// Returns any failure from socket creation, option setting, subscribing, or connecting.
pub(super) fn connect_sub_socket(endpoint: &str) -> Result<SharedSocket> {
    let subscriber = build_socket(zmq::SUB, |raw| {
        configure_receive_socket(raw)?;
        raw.set_subscribe(b"")?;
        raw.connect(endpoint)?;
        Ok(())
    })?;
    Ok(Arc::new(Mutex::new(subscriber)))
}
/// Builds a DEALER socket with send and receive options and connects it to `endpoint`.
///
/// # Errors
/// Returns any failure from socket creation, option setting, or connecting.
pub(super) fn connect_dealer_socket(endpoint: &str) -> Result<SharedSocket> {
    let dealer = build_socket(zmq::DEALER, |raw| {
        configure_bidirectional_socket(raw)?;
        raw.connect(endpoint)?;
        Ok(())
    })?;
    Ok(Arc::new(Mutex::new(dealer)))
}
/// Test-only: builds a PUB socket with send-side options and binds it to `endpoint`.
///
/// # Errors
/// Returns any failure from socket creation, option setting, or binding.
#[cfg(test)]
pub(super) fn bind_pub_socket(endpoint: &str) -> Result<SharedSocket> {
    let publisher = build_socket(zmq::PUB, |raw| {
        configure_send_socket(raw)?;
        raw.bind(endpoint)?;
        Ok(())
    })?;
    Ok(Arc::new(Mutex::new(publisher)))
}
/// Locks `socket` and waits for one complete multipart message.
///
/// The lock is held for the whole wait.
///
/// # Errors
/// Returns whatever error `ZmqSocket::poll_recv_multipart` reports.
pub(super) async fn recv_multipart(socket: &SharedSocket) -> Result<MultipartMessage> {
    let mut guard = socket.lock().await;
    poll_fn(|cx| guard.poll_recv_multipart(cx)).await
}
/// Locks `socket` and sends `frames` as one multipart message.
///
/// The lock is held until every frame has been handed to the socket or an error occurs.
///
/// # Errors
/// Returns whatever error `ZmqSocket::poll_send_multipart` reports.
pub(super) async fn send_multipart(socket: &SharedSocket, frames: MultipartMessage) -> Result<()> {
    let mut guard = socket.lock().await;
    // Frames are queued front-to-back; the poll function drains the queue as it sends.
    let mut pending: VecDeque<zmq::Message> =
        frames.into_iter().map(zmq::Message::from).collect();
    poll_fn(|cx| guard.poll_send_multipart(cx, &mut pending)).await
}
...@@ -167,7 +167,6 @@ tokio-rayon = {version = "2" } ...@@ -167,7 +167,6 @@ tokio-rayon = {version = "2" }
ndarray = { version = "0.16" } ndarray = { version = "0.16" }
# Publishers # Publishers
zeromq = "0.4.1"
rmp-serde = "1.3" rmp-serde = "1.3"
[dev-dependencies] [dev-dependencies]
......
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
use super::*; use super::*;
use super::zmq::*;
use utils::*; use utils::*;
use zmq::*;
use derive_builder::Builder; use derive_builder::Builder;
use std::sync::Arc; use std::sync::Arc;
......
...@@ -3,10 +3,10 @@ ...@@ -3,10 +3,10 @@
use super::*; use super::*;
use super::zmq::*;
use futures::future::try_join_all; use futures::future::try_join_all;
use nixl_sys::NixlDescriptor; use nixl_sys::NixlDescriptor;
use utils::*; use utils::*;
use zmq::*;
use BlockTransferPool::*; use BlockTransferPool::*;
......
...@@ -3,10 +3,10 @@ ...@@ -3,10 +3,10 @@
use super::*; use super::*;
use super::zmq::*;
use async_trait::async_trait; use async_trait::async_trait;
use transfer::*; use transfer::*;
use utils::*; use utils::*;
use zmq::*;
use crate::block_manager::{ use crate::block_manager::{
BasicMetadata, BlockMetadata, LayoutConfigBuilder, NixlLayout, Storage, BasicMetadata, BlockMetadata, LayoutConfigBuilder, NixlLayout, Storage,
......
...@@ -14,9 +14,9 @@ use std::sync::Arc; ...@@ -14,9 +14,9 @@ use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::atomic::{AtomicU64, Ordering};
use tokio::sync::RwLock; use tokio::sync::RwLock;
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
use zeromq::{PubSocket, Socket, SocketSend};
use super::tracker::{CacheStatusTracker, ConsolidatedEvent}; use super::tracker::{CacheStatusTracker, ConsolidatedEvent};
use crate::utils::zmq::{bind_pub_socket, send_multipart};
/// Event batch structure matching vLLM's format (array_like=True) /// Event batch structure matching vLLM's format (array_like=True)
/// Format: [timestamp, [events], data_parallel_rank] /// Format: [timestamp, [events], data_parallel_rank]
...@@ -183,9 +183,7 @@ impl KvEventConsolidatorPublisher { ...@@ -183,9 +183,7 @@ impl KvEventConsolidatorPublisher {
tracing::info!("Starting consolidated event publisher on {}", endpoint); tracing::info!("Starting consolidated event publisher on {}", endpoint);
// Create ZMQ PUB socket and bind // Create ZMQ PUB socket and bind
let mut socket = PubSocket::new(); let socket = bind_pub_socket(&endpoint)
socket
.bind(&endpoint)
.await .await
.with_context(|| format!("Failed to bind publisher to {}", endpoint))?; .with_context(|| format!("Failed to bind publisher to {}", endpoint))?;
...@@ -258,16 +256,9 @@ impl KvEventConsolidatorPublisher { ...@@ -258,16 +256,9 @@ impl KvEventConsolidatorPublisher {
Bytes::from(seq_bytes.to_vec()), Bytes::from(seq_bytes.to_vec()),
Bytes::from(payload), Bytes::from(payload),
]; ];
let frames = frames.into_iter().map(|frame| frame.to_vec()).collect();
let msg = match zeromq::ZmqMessage::try_from(frames) { if let Err(e) = send_multipart(&socket, frames).await {
Ok(m) => m,
Err(e) => {
tracing::error!("Failed to create multipart ZMQ message: {:?}", e);
continue;
}
};
if let Err(e) = socket.send(msg).await {
tracing::error!("Failed to send consolidated events: {}", e); tracing::error!("Failed to send consolidated events: {}", e);
} else { } else {
tracing::debug!( tracing::debug!(
......
...@@ -6,17 +6,18 @@ ...@@ -6,17 +6,18 @@
//! This is a simplified subscriber that deserializes raw vLLM/TensorRT-LLM events. //! This is a simplified subscriber that deserializes raw vLLM/TensorRT-LLM events.
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use futures::StreamExt;
use rmp_serde::Deserializer; use rmp_serde::Deserializer;
use serde::Deserialize; use serde::Deserialize;
use std::sync::Arc; use std::sync::Arc;
use tokio::sync::RwLock; use tokio::sync::RwLock;
use tokio::task::JoinHandle; use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use zeromq::{Socket, SocketRecv, SubSocket};
use dynamo_kv_router::zmq_wire::RawKvEvent; use dynamo_kv_router::zmq_wire::RawKvEvent;
use super::tracker::{CacheStatusTracker, EventSource, StorageTier}; use super::tracker::{CacheStatusTracker, EventSource, StorageTier};
use crate::utils::zmq::{connect_sub_socket, multipart_message};
/// Event batch received from vLLM/TensorRT-LLM (array format) /// Event batch received from vLLM/TensorRT-LLM (array format)
/// Format: [timestamp, [events], data_parallel_rank] /// Format: [timestamp, [events], data_parallel_rank]
...@@ -73,15 +74,10 @@ async fn run_listener_loop( ...@@ -73,15 +74,10 @@ async fn run_listener_loop(
endpoint endpoint
); );
let mut socket = SubSocket::new(); let socket = connect_sub_socket(&endpoint, None)
socket
.connect(&endpoint)
.await .await
.context("Failed to connect to ZMQ endpoint")?; .with_context(|| format!("Failed to connect to ZMQ endpoint {endpoint}"))?;
socket let mut socket = socket;
.subscribe("")
.await
.context("Failed to subscribe to ZMQ topics")?;
tracing::info!( tracing::info!(
"KV event consolidator ZMQ listener successfully connected to {}", "KV event consolidator ZMQ listener successfully connected to {}",
...@@ -97,18 +93,19 @@ async fn run_listener_loop( ...@@ -97,18 +93,19 @@ async fn run_listener_loop(
break; break;
} }
msg_result = socket.recv() => { msg_result = socket.next() => {
let Ok(msg) = msg_result else { let frames = match msg_result {
tracing::warn!("Error receiving ZMQ message: {:?}", msg_result.unwrap_err()); Some(Ok(frames)) => multipart_message(frames),
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; Some(Err(error)) => {
continue; tracing::error!("Error receiving ZMQ message: {error}");
break;
}
None => break,
}; };
// Parse multipart message: supports both formats // Parse multipart message: supports both formats
// - 2 frames: [topic, payload] // - 2 frames: [topic, payload]
// - 3 frames: [topic, sequence, payload] // - 3 frames: [topic, sequence, payload]
let frames: Vec<Vec<u8>> = msg.into_vec().into_iter().map(|f| f.to_vec()).collect();
let payload = match frames.len() { let payload = match frames.len() {
2 => &frames[1], // [topic, payload] 2 => &frames[1], // [topic, payload]
3 => &frames[2], // [topic, sequence, payload] 3 => &frames[2], // [topic, sequence, payload]
......
...@@ -11,19 +11,17 @@ ...@@ -11,19 +11,17 @@
//! [`crate::kv_router::publisher::KvEventPublisher`], but is much simpler: //! [`crate::kv_router::publisher::KvEventPublisher`], but is much simpler:
//! no event transformation, no batching, no local indexer — just raw byte relay. //! no event transformation, no batching, no local indexer — just raw byte relay.
use std::time::Duration;
use anyhow::Result; use anyhow::Result;
use futures::StreamExt;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use zeromq::SocketRecv;
use crate::utils::zmq::connect_sub_socket_with_retry;
use dynamo_runtime::component::Component; use dynamo_runtime::component::Component;
use dynamo_runtime::traits::DistributedRuntimeProvider; use dynamo_runtime::traits::DistributedRuntimeProvider;
use dynamo_runtime::transports::event_plane::EventPublisher; use dynamo_runtime::transports::event_plane::EventPublisher;
use crate::utils::zmq::{connect_sub_socket, multipart_message};
const FPM_TOPIC: &str = "forward-pass-metrics"; const FPM_TOPIC: &str = "forward-pass-metrics";
const MAX_CONSECUTIVE_ERRORS: u32 = 10;
/// A relay that bridges ForwardPassMetrics from a local raw ZMQ PUB socket /// A relay that bridges ForwardPassMetrics from a local raw ZMQ PUB socket
/// to the Dynamo event plane. /// to the Dynamo event plane.
...@@ -62,15 +60,16 @@ impl FpmEventRelay { ...@@ -62,15 +60,16 @@ impl FpmEventRelay {
publisher: EventPublisher, publisher: EventPublisher,
cancel: CancellationToken, cancel: CancellationToken,
) { ) {
let Some(mut socket) = let socket = match connect_sub_socket(&zmq_endpoint, None).await {
connect_sub_socket_with_retry(&zmq_endpoint, None, &cancel, "FPM relay").await Ok(socket) => socket,
else { Err(error) => {
return; tracing::error!(endpoint = %zmq_endpoint, error = %error, "FPM relay: failed to connect");
return;
}
}; };
let mut socket = socket;
tracing::info!("FPM relay: connected to {zmq_endpoint}"); tracing::info!("FPM relay: connected to {zmq_endpoint}");
let mut consecutive_errors: u32 = 0;
loop { loop {
tokio::select! { tokio::select! {
biased; biased;
...@@ -78,16 +77,11 @@ impl FpmEventRelay { ...@@ -78,16 +77,11 @@ impl FpmEventRelay {
tracing::info!("FPM relay: shutting down"); tracing::info!("FPM relay: shutting down");
break; break;
} }
result = socket.recv() => { result = socket.next() => {
match result { match result {
Ok(msg) => { Some(Ok(frames)) => {
consecutive_errors = 0; let mut frames = multipart_message(frames);
// ZMQ multipart: [topic, seq, payload] // ZMQ multipart: [topic, seq, payload]
let mut frames: Vec<Vec<u8>> = msg
.into_vec()
.into_iter()
.map(|f| f.to_vec())
.collect();
if frames.len() == 3 { if frames.len() == 3 {
let payload = frames.swap_remove(2); let payload = frames.swap_remove(2);
if let Err(e) = publisher.publish_bytes(payload).await { if let Err(e) = publisher.publish_bytes(payload).await {
...@@ -97,19 +91,16 @@ impl FpmEventRelay { ...@@ -97,19 +91,16 @@ impl FpmEventRelay {
tracing::warn!( tracing::warn!(
"FPM relay: unexpected ZMQ frame count: expected 3, got {}", "FPM relay: unexpected ZMQ frame count: expected 3, got {}",
frames.len() frames.len()
);
}
}
Err(e) => {
consecutive_errors += 1;
tracing::warn!(
"FPM relay: ZMQ recv error ({consecutive_errors}/{MAX_CONSECUTIVE_ERRORS}): {e}"
); );
if consecutive_errors >= MAX_CONSECUTIVE_ERRORS {
tracing::error!("FPM relay: too many consecutive errors, exiting");
break;
} }
tokio::time::sleep(Duration::from_millis(100)).await; }
Some(Err(e)) => {
tracing::error!("FPM relay: ZMQ recv failed: {e}");
break;
}
None => {
tracing::error!("FPM relay: ZMQ stream ended");
break;
} }
} }
} }
......
...@@ -40,11 +40,6 @@ use event_processor::{ ...@@ -40,11 +40,6 @@ use event_processor::{
}; };
pub use worker_metrics::WorkerMetricsPublisher; pub use worker_metrics::WorkerMetricsPublisher;
use zmq_listener::start_zmq_listener; use zmq_listener::start_zmq_listener;
#[cfg(test)]
use zmq_listener::{
INITIAL_BACKOFF_MS, MAX_BACKOFF_EXPONENT, MAX_BACKOFF_MS, MAX_CONSECUTIVE_ERRORS,
calculate_backoff_ms,
};
const MAX_BATCHING_TIMEOUT_MS: u64 = 15_000; const MAX_BATCHING_TIMEOUT_MS: u64 = 15_000;
pub const DEFAULT_BATCHING_TIMEOUT_MS: Option<u64> = None; pub const DEFAULT_BATCHING_TIMEOUT_MS: Option<u64> = None;
......
...@@ -13,8 +13,6 @@ use std::future::Future; ...@@ -13,8 +13,6 @@ use std::future::Future;
#[allow(unused_imports)] #[allow(unused_imports)]
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::time::Duration; use std::time::Duration;
#[allow(unused_imports)]
use zeromq::{PubSocket, Socket, SocketSend, ZmqMessage};
#[cfg(test)] #[cfg(test)]
mod test_event_processing { mod test_event_processing {
...@@ -430,11 +428,11 @@ mod test_event_processing { ...@@ -430,11 +428,11 @@ mod test_event_processing {
#[cfg(test)] #[cfg(test)]
mod tests_startup_helpers { mod tests_startup_helpers {
use super::*; use super::*;
use crate::utils::zmq::{bind_pub_socket, send_multipart};
use bytes::Bytes; use bytes::Bytes;
use dynamo_kv_router::indexer::{GetWorkersRequest, KvIndexer, KvIndexerInterface}; use dynamo_kv_router::indexer::{GetWorkersRequest, KvIndexer, KvIndexerInterface};
use dynamo_kv_router::protocols::{ExternalSequenceBlockHash, LocalBlockHash}; use dynamo_kv_router::protocols::{ExternalSequenceBlockHash, LocalBlockHash};
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use zeromq::{PubSocket, Socket, SocketSend, ZmqMessage};
// Type alias to resolve clippy::type_complexity warning // Type alias to resolve clippy::type_complexity warning
type PublishedEvents = Arc<Mutex<Vec<(String, Vec<u8>)>>>; type PublishedEvents = Arc<Mutex<Vec<(String, Vec<u8>)>>>;
...@@ -828,13 +826,20 @@ mod tests_startup_helpers { ...@@ -828,13 +826,20 @@ mod tests_startup_helpers {
// Prepare channel that listener should fill // Prepare channel that listener should fill
let (tx, mut rx) = mpsc::unbounded_channel::<PlacementEvent>(); let (tx, mut rx) = mpsc::unbounded_channel::<PlacementEvent>();
// ZMQ TCP endpoint using localhost with fixed port // ZMQ TCP endpoint using localhost with an ephemeral port
let endpoint = "tcp://127.0.0.1:15555"; let reserved_listener = reserve_open_port();
let endpoint = format!(
"tcp://127.0.0.1:{}",
reserved_listener
.local_addr()
.expect("failed to read reserved listener address")
.port()
);
drop(reserved_listener);
let topic = "".to_string(); // subscribe to all let topic = "".to_string(); // subscribe to all
// Publisher side - set up first // Publisher side - set up first
let mut pub_socket = PubSocket::new(); let pub_socket = bind_pub_socket(&endpoint).await.unwrap();
pub_socket.bind(endpoint).await.unwrap();
// Cancellation token so we can stop the listener // Cancellation token so we can stop the listener
let token = dynamo_runtime::CancellationToken::new(); let token = dynamo_runtime::CancellationToken::new();
...@@ -873,16 +878,13 @@ mod tests_startup_helpers { ...@@ -873,16 +878,13 @@ mod tests_startup_helpers {
let payload = Bytes::from(rmps::to_vec(&batch).unwrap()); let payload = Bytes::from(rmps::to_vec(&batch).unwrap());
let frames = vec![ let frames = vec![
Bytes::from(""), Bytes::from("").to_vec(),
Bytes::from(seq.to_be_bytes().to_vec()), Bytes::from(seq.to_be_bytes().to_vec()).to_vec(),
payload.clone(), payload.clone().to_vec(),
]; ];
// Create a proper multipart message
let msg = ZmqMessage::try_from(frames).expect("Failed to create ZmqMessage");
// Send the multipart message // Send the multipart message
pub_socket.send(msg).await.unwrap(); send_multipart(&pub_socket, frames).await.unwrap();
// Wait for message to be received // Wait for message to be received
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
...@@ -907,6 +909,75 @@ mod tests_startup_helpers { ...@@ -907,6 +909,75 @@ mod tests_startup_helpers {
let _ = listener_handle.await; let _ = listener_handle.await;
} }
/// Checks that the listener still delivers events when it is started before the
/// publisher has bound the endpoint.
///
/// The listener task is spawned first, the publisher binds the same endpoint about
/// 150 ms later, and the test then publishes one `BlockStored` batch repeatedly until a
/// copy comes out of the listener's channel or a 5 second deadline expires.
#[tokio::test]
async fn test_start_zmq_listener_connects_before_publisher_bind() {
    let (tx, mut rx) = mpsc::unbounded_channel::<PlacementEvent>();

    // Let the OS pick a free loopback port, remember it, and release it again so the
    // publisher can bind it later in the test.
    let reserved_listener = reserve_open_port();
    let endpoint = format!(
        "tcp://127.0.0.1:{}",
        reserved_listener
            .local_addr()
            .expect("failed to read reserved listener address")
            .port()
    );
    drop(reserved_listener);

    let topic = String::new();
    let token = dynamo_runtime::CancellationToken::new();
    let next_event_id = Arc::new(AtomicU64::new(0));

    // Start the listener while nothing is bound on the endpoint yet.
    // NOTE(review): the literals 1 and 4 are presumably the worker id and the KV block
    // size (the batch below uses block_size 4) - confirm against start_zmq_listener.
    let listener_handle = tokio::spawn({
        let token = token.clone();
        let endpoint = endpoint.clone();
        start_zmq_listener(endpoint, topic, 1, tx, token, 4, next_event_id)
    });

    // Give the listener time to attempt its connection before the publisher exists.
    tokio::time::sleep(tokio::time::Duration::from_millis(150)).await;
    let pub_socket = bind_pub_socket(&endpoint).await.unwrap();

    let batch = KvEventBatch {
        ts: 0.0,
        events: vec![RawKvEvent::BlockStored {
            block_hashes: vec![BlockHashValue::Unsigned(64)],
            parent_block_hash: None,
            token_ids: vec![4, 5, 6, 7],
            block_size: 4,
            medium: None,
            lora_name: None,
            block_mm_infos: None,
            is_eagle: None,
        }],
        data_parallel_rank: Some(0),
    };
    let payload = rmps::to_vec(&batch).unwrap();
    // Three frames: an empty first frame, a big-endian u64 (12), then the encoded batch.
    let frames = vec![Vec::new(), 12u64.to_be_bytes().to_vec(), payload];

    // A PUB socket drops messages while no subscriber is attached, and how long the
    // listener needs to attach after the late bind is not under the test's control.
    // Instead of a fixed number of sends, publish every 50 ms until one copy is
    // delivered; the outer timeout bounds the whole exchange.
    let event = tokio::time::timeout(tokio::time::Duration::from_secs(5), async {
        loop {
            send_multipart(&pub_socket, frames.clone())
                .await
                .unwrap();
            let poll = tokio::time::timeout(tokio::time::Duration::from_millis(50), rx.recv());
            if let Ok(received) = poll.await {
                break received;
            }
        }
    })
    .await
    .expect("timed out waiting for listener event")
    .expect("listener channel closed")
    .event;

    let KvCacheEventData::Stored(KvCacheStoreData { blocks, .. }) = event.data else {
        panic!("expected KvCacheStoreData");
    };
    assert_eq!(blocks[0].block_hash.0, 64);

    // Stop the listener and wait for its task to finish.
    token.cancel();
    let _ = listener_handle.await;
}
/// Binds a TCP listener on `127.0.0.1` with port 0, so the operating system assigns a
/// free port, and hands the bound listener back to the caller.
///
/// # Panics
/// Panics with "failed to bind probe listener" when the bind fails.
fn reserve_open_port() -> std::net::TcpListener {
    let bound = std::net::TcpListener::bind("127.0.0.1:0");
    bound.expect("failed to bind probe listener")
}
//-------------------------------------------------------------------- //--------------------------------------------------------------------
// Test distributed recovery: Router queries worker's LocalKvIndexer after outage // Test distributed recovery: Router queries worker's LocalKvIndexer after outage
//-------------------------------------------------------------------- //--------------------------------------------------------------------
...@@ -1120,55 +1191,6 @@ mod tests_startup_helpers { ...@@ -1120,55 +1191,6 @@ mod tests_startup_helpers {
} }
} }
#[cfg(test)]
mod test_exponential_backoff {
    use super::*;

    /// The delay doubles with every consecutive error, from 10 ms up to 2560 ms.
    #[test]
    fn test_backoff_calculation_progression() {
        // Entry `i` is the expected delay after `i` consecutive errors: 10 * 2^i.
        let expected_ms: [u64; 9] = [10, 20, 40, 80, 160, 320, 640, 1280, 2560];
        for (errors, expected) in (0u32..).zip(expected_ms) {
            assert_eq!(calculate_backoff_ms(errors), expected);
        }
    }

    /// Error counts past 8 keep producing the same 2560 ms delay as 8 does.
    #[test]
    fn test_backoff_caps_at_max_exponent() {
        for errors in [8u32, 9, 100] {
            assert_eq!(calculate_backoff_ms(errors), 2560);
        }
    }

    /// No error count in `0..20` yields a delay above `MAX_BACKOFF_MS`.
    #[test]
    fn test_backoff_never_exceeds_max() {
        for errors in 0..20 {
            let delay_ms = calculate_backoff_ms(errors);
            assert!(delay_ms <= MAX_BACKOFF_MS);
        }
    }

    /// The backoff constants must be mutually consistent.
    #[test]
    #[expect(clippy::assertions_on_constants)]
    fn test_backoff_constants_are_sane() {
        assert!(INITIAL_BACKOFF_MS > 0);
        assert!(MAX_BACKOFF_MS > INITIAL_BACKOFF_MS);
        // Keep the exponent small so the doubling cannot run away.
        assert!(MAX_BACKOFF_EXPONENT <= 10);
        assert!(MAX_CONSECUTIVE_ERRORS > 0);

        // The largest value the doubling can reach must already fit under the cap.
        let largest_uncapped = INITIAL_BACKOFF_MS * 2_u64.pow(MAX_BACKOFF_EXPONENT);
        assert!(largest_uncapped <= MAX_BACKOFF_MS);
    }
}
#[cfg(all(test, feature = "integration"))] #[cfg(all(test, feature = "integration"))]
mod test_integration_publisher { mod test_integration_publisher {
use super::*; use super::*;
......
...@@ -3,28 +3,16 @@ ...@@ -3,28 +3,16 @@
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::time::Duration;
use futures::StreamExt;
use rmp_serde as rmps; use rmp_serde as rmps;
use tokio::sync::mpsc; use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use zeromq::SocketRecv;
use crate::utils::zmq::connect_sub_socket_with_retry;
use dynamo_kv_router::protocols::*; use dynamo_kv_router::protocols::*;
use dynamo_kv_router::zmq_wire::*; use dynamo_kv_router::zmq_wire::*;
pub(super) const INITIAL_BACKOFF_MS: u64 = 10; use crate::utils::zmq::{connect_sub_socket, multipart_message};
pub(super) const MAX_BACKOFF_MS: u64 = 5000;
pub(super) const MAX_CONSECUTIVE_ERRORS: u32 = 10;
pub(super) const MAX_BACKOFF_EXPONENT: u32 = 8;
/// Returns the backoff delay in milliseconds for the given number of consecutive errors.
///
/// The delay is `INITIAL_BACKOFF_MS * 2^e`, where `e` is `consecutive_errors` clamped to
/// `MAX_BACKOFF_EXPONENT`, and the result is never larger than `MAX_BACKOFF_MS`.
pub(super) fn calculate_backoff_ms(consecutive_errors: u32) -> u64 {
    let exponent = consecutive_errors.min(MAX_BACKOFF_EXPONENT);
    let uncapped_ms = INITIAL_BACKOFF_MS * 2_u64.pow(exponent);
    uncapped_ms.min(MAX_BACKOFF_MS)
}
pub(super) async fn start_zmq_listener( pub(super) async fn start_zmq_listener(
zmq_endpoint: String, zmq_endpoint: String,
...@@ -42,63 +30,40 @@ pub(super) async fn start_zmq_listener( ...@@ -42,63 +30,40 @@ pub(super) async fn start_zmq_listener(
); );
let warning_count = Arc::new(AtomicU32::new(0)); let warning_count = Arc::new(AtomicU32::new(0));
let Some(mut socket) = connect_sub_socket_with_retry( let socket = match connect_sub_socket(&zmq_endpoint, Some(&zmq_topic)).await {
&zmq_endpoint, Ok(socket) => socket,
Some(&zmq_topic), Err(error) => {
&cancellation_token, tracing::error!(endpoint = %zmq_endpoint, topic = %zmq_topic, error = %error, "ZMQ listener failed to connect");
"ZMQ listener", return;
) }
.await
else {
return;
}; };
let mut socket = socket;
if cancellation_token.is_cancelled() {
return;
}
let mut consecutive_errors = 0u32;
#[expect(unused_assignments)]
let mut exit_reason = "unknown";
let mut messages_processed = 0u64; let mut messages_processed = 0u64;
'main: loop { let exit_reason = 'main: loop {
tokio::select! { tokio::select! {
biased; biased;
_ = cancellation_token.cancelled() => { _ = cancellation_token.cancelled() => {
tracing::debug!("ZMQ listener received cancellation signal"); tracing::debug!("ZMQ listener received cancellation signal");
exit_reason = "cancellation token cancelled"; break 'main String::from("cancellation token cancelled");
break 'main;
} }
msg_result = socket.recv() => { msg_result = socket.next() => {
let Ok(msg) = msg_result else { let frames = match msg_result {
let e = msg_result.unwrap_err(); Some(Ok(frames)) => multipart_message(frames),
consecutive_errors += 1; Some(Err(error)) => {
tracing::error!(endpoint = %zmq_endpoint, error = %error, "ZMQ listener recv failed");
if consecutive_errors >= MAX_CONSECUTIVE_ERRORS { break 'main format!("ZMQ recv failed: {error}");
tracing::error!(
error=%e,
consecutive_errors=%consecutive_errors,
"Too many consecutive ZMQ errors, terminating listener"
);
exit_reason = "too many consecutive errors";
break 'main;
} }
None => break 'main String::from("ZMQ stream ended"),
let backoff_ms = calculate_backoff_ms(consecutive_errors);
tracing::warn!(
error=%e,
consecutive_errors=%consecutive_errors,
backoff_ms=%backoff_ms,
"Error reading from ZMQ socket, applying exponential backoff"
);
tokio::time::sleep(Duration::from_millis(backoff_ms)).await;
continue;
}; };
consecutive_errors = 0; let mut frames = frames;
let mut frames: Vec<Vec<u8>> =
msg.into_vec().into_iter().map(|frame| frame.to_vec()).collect();
if frames.len() != 3 { if frames.len() != 3 {
tracing::warn!( tracing::warn!(
...@@ -144,14 +109,13 @@ pub(super) async fn start_zmq_listener( ...@@ -144,14 +109,13 @@ pub(super) async fn start_zmq_listener(
convert_event(raw_event, event_id, kv_block_size, worker, &warning_count); convert_event(raw_event, event_id, kv_block_size, worker, &warning_count);
if tx.send(event).is_err() { if tx.send(event).is_err() {
tracing::warn!("Failed to send message to channel - receiver dropped"); tracing::warn!("Failed to send message to channel - receiver dropped");
exit_reason = "channel receiver dropped"; break 'main String::from("channel receiver dropped");
break 'main;
} }
messages_processed += 1; messages_processed += 1;
} }
} }
} }
} };
tracing::debug!( tracing::debug!(
"ZMQ listener exiting, reason: {}, messages processed: {}", "ZMQ listener exiting, reason: {}, messages processed: {}",
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment