"deploy/operator/config/rbac/role.yaml" did not exist on "9e6972a548c44e78361ca1296d36f862bbe4dbae"
Unverified Commit 4d3269cd authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

feat: discovery traits, systems and managers (#4070)


Signed-off-by: default avatarRyan Olson <rolson@nvidia.com>
parent 1a27649c
......@@ -27,6 +27,41 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
[[package]]
name = "aead"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0"
dependencies = [
"crypto-common",
"generic-array",
]
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if 1.0.4",
"cipher",
"cpufeatures",
]
[[package]]
name = "aes-gcm"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1"
dependencies = [
"aead",
"aes",
"cipher",
"ctr",
"ghash",
"subtle",
]
[[package]]
name = "ahash"
version = "0.8.12"
......@@ -56,6 +91,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1763692fc1416554cf051efc56a3de5595eca47299d731cc5c2b583adf8b4d2f"
[[package]]
name = "aligned"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "377e4c0ba83e4431b10df45c1d4666f178ea9c552cac93e60c3a88bf32785923"
dependencies = [
"as-slice",
]
[[package]]
name = "aligned-vec"
version = "0.6.4"
......@@ -118,22 +162,22 @@ dependencies = [
[[package]]
name = "anstyle-query"
version = "1.1.4"
version = "1.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2"
checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
dependencies = [
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.10"
version = "3.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a"
checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
dependencies = [
"anstyle",
"once_cell_polyfill",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
......@@ -210,6 +254,54 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0f477b951e452a0b6b4a10b53ccd569042d1d01729b519e02074a9c0958a063"
[[package]]
name = "as-slice"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "asn1-rs"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56624a96882bb8c26d61312ae18cb45868e5a9992ea73c58e45c3101e56a1e60"
dependencies = [
"asn1-rs-derive",
"asn1-rs-impl",
"displaydoc",
"nom 7.1.3",
"num-traits",
"rusticata-macros",
"thiserror 2.0.17",
"time",
]
[[package]]
name = "asn1-rs-derive"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3109e49b1e4909e9db6515a30c633684d68cdeaa252f215214cb4fa1a5bfee2c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.110",
"synstructure",
]
[[package]]
name = "asn1-rs-impl"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.110",
]
[[package]]
name = "assert-json-diff"
version = "2.0.2"
......@@ -226,6 +318,22 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9"
[[package]]
name = "astral-tokio-tar"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec179a06c1769b1e42e1e2cbe74c7dcdb3d6383c838454d063eaac5bbb7ebbe5"
dependencies = [
"filetime",
"futures-core",
"libc",
"portable-atomic",
"rustc-hash 2.1.1",
"tokio",
"tokio-stream",
"xattr",
]
[[package]]
name = "async-broadcast"
version = "0.7.2"
......@@ -238,15 +346,33 @@ dependencies = [
"pin-project-lite",
]
[[package]]
name = "async-io"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc"
dependencies = [
"autocfg",
"cfg-if 1.0.4",
"concurrent-queue",
"futures-io",
"futures-lite",
"parking",
"polling",
"rustix",
"slab",
"windows-sys 0.61.2",
]
[[package]]
name = "async-nats"
version = "0.40.0"
version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e23419d455dc57d3ae60a2f4278cf561fc74fe866e548e14d2b0ad3e1b8ca0b2"
checksum = "86dde77d8a733a9dbaf865a9eb65c72e09c88f3d14d3dd0d2aecf511920ee4fe"
dependencies = [
"base64 0.22.1",
"bytes",
"futures",
"futures-util",
"memchr",
"nkeys",
"nuid",
......@@ -267,6 +393,7 @@ dependencies = [
"time",
"tokio",
"tokio-rustls",
"tokio-stream",
"tokio-util",
"tokio-websockets",
"tracing",
......@@ -366,6 +493,18 @@ version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
[[package]]
name = "attohttpc"
version = "0.30.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16e2cdb6d5ed835199484bb92bb8b3edd526effe995c61732580439c1a67e2e9"
dependencies = [
"base64 0.22.1",
"http 1.3.1",
"log",
"url",
]
[[package]]
name = "atty"
version = "0.2.14"
......@@ -383,6 +522,26 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "av-scenechange"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f321d77c20e19b92c39e7471cf986812cbb46659d2af674adc4331ef3f18394"
dependencies = [
"aligned",
"anyhow",
"arg_enum_proc_macro",
"arrayvec",
"log",
"num-rational",
"num-traits",
"pastey",
"rayon",
"thiserror 2.0.17",
"v_frame",
"y4m",
]
[[package]]
name = "av1-grain"
version = "0.2.5"
......@@ -470,7 +629,7 @@ dependencies = [
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-util",
"itoa",
"matchit 0.8.4",
......@@ -543,16 +702,16 @@ dependencies = [
[[package]]
name = "axum-server"
version = "0.7.2"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "495c05f60d6df0093e8fb6e74aa5846a0ad06abaf96d76166283720bf740f8ab"
checksum = "c1ab4a3ec9ea8a657c72d99a03a824af695bd0fb5ec639ccbd9cd3543b41a5f9"
dependencies = [
"arc-swap",
"bytes",
"fs-err",
"http 1.3.1",
"http-body 1.0.1",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-util",
"pin-project-lite",
"rustls",
......@@ -603,6 +762,22 @@ dependencies = [
"windows-link 0.2.1",
]
[[package]]
name = "base-x"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270"
[[package]]
name = "base256emoji"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e9430d9a245a77c92176e649af6e275f20839a48389859d1661e9a128d077c"
dependencies = [
"const-str",
"match-lookup",
]
[[package]]
name = "base64"
version = "0.13.1"
......@@ -770,9 +945,21 @@ dependencies = [
[[package]]
name = "bitstream-io"
version = "2.6.0"
version = "4.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60d4bd9d1db2c6bdf285e223a7fa369d5ce98ec767dec949c6ca62863ce61757"
dependencies = [
"core2",
]
[[package]]
name = "blake2"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2"
checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe"
dependencies = [
"digest",
]
[[package]]
name = "blake3"
......@@ -804,6 +991,15 @@ dependencies = [
"generic-array",
]
[[package]]
name = "block2"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5"
dependencies = [
"objc2",
]
[[package]]
name = "bm25"
version = "2.3.2"
......@@ -818,6 +1014,92 @@ dependencies = [
"unicode-segmentation",
]
[[package]]
name = "bollard"
version = "0.19.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87a52479c9237eb04047ddb94788c41ca0d26eaff8b697ecfbb4c32f7fdc3b1b"
dependencies = [
"async-stream",
"base64 0.22.1",
"bitflags 2.10.0",
"bollard-buildkit-proto",
"bollard-stubs",
"bytes",
"chrono",
"futures-core",
"futures-util",
"hex",
"home",
"http 1.3.1",
"http-body-util",
"hyper 1.8.1",
"hyper-named-pipe",
"hyper-rustls",
"hyper-util",
"hyperlocal",
"log",
"num",
"pin-project-lite",
"rand 0.9.2",
"rustls",
"rustls-native-certs 0.8.2",
"rustls-pemfile",
"rustls-pki-types",
"serde",
"serde_derive",
"serde_json",
"serde_repr",
"serde_urlencoded",
"thiserror 2.0.17",
"tokio",
"tokio-stream",
"tokio-util",
"tonic 0.14.2",
"tower-service",
"url",
"winapi 0.3.9",
]
[[package]]
name = "bollard-buildkit-proto"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85a885520bf6249ab931a764ffdb87b0ceef48e6e7d807cfdb21b751e086e1ad"
dependencies = [
"prost 0.14.1",
"prost-types 0.14.1",
"tonic 0.14.2",
"tonic-prost",
"ureq 3.1.4",
]
[[package]]
name = "bollard-stubs"
version = "1.49.1-rc.28.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5731fe885755e92beff1950774068e0cae67ea6ec7587381536fca84f1779623"
dependencies = [
"base64 0.22.1",
"bollard-buildkit-proto",
"bytes",
"chrono",
"prost 0.14.1",
"serde",
"serde_json",
"serde_repr",
"serde_with",
]
[[package]]
name = "bs58"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf88ba1141d185c399bee5288d850d63b8369520c1eafc32a0430b5b6c287bf4"
dependencies = [
"tinyvec",
]
[[package]]
name = "bs62"
version = "0.1.4"
......@@ -842,9 +1124,9 @@ dependencies = [
[[package]]
name = "built"
version = "0.7.7"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56ed6191a7e78c36abdb16ab65341eefd73d64d303fffccdbb00d51e4205967b"
checksum = "f4ad8f11f288f48ca24471bbd51ac257aaeaaa07adae295591266b792902ae64"
[[package]]
name = "bumpalo"
......@@ -892,9 +1174,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
[[package]]
name = "bytes"
version = "1.10.1"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3"
dependencies = [
"serde",
]
......@@ -1039,7 +1321,7 @@ version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fce8dd7fcfcbf3a0a87d8f515194b49d6135acab73e18bd380d1d93bb1a15eb"
dependencies = [
"clap 4.5.51",
"clap 4.5.52",
"heck 0.4.1",
"indexmap 2.12.0",
"log",
......@@ -1054,9 +1336,9 @@ dependencies = [
[[package]]
name = "cc"
version = "1.2.45"
version = "1.2.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35900b6c8d709fb1d854671ae27aeaa9eec2f8b01b364e1619a40da3e6fe2afe"
checksum = "b97463e1064cb1b1c1384ad0a0b9c8abd0988e2a91f52606c80ef14aadb63e36"
dependencies = [
"find-msvc-tools",
"jobserver",
......@@ -1115,6 +1397,30 @@ dependencies = [
"vob",
]
[[package]]
name = "chacha20"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3613f74bd2eac03dad61bd53dbe620703d4371614fe0bc3b9f04dd36fe4e818"
dependencies = [
"cfg-if 1.0.4",
"cipher",
"cpufeatures",
]
[[package]]
name = "chacha20poly1305"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10cd79432192d1c0f4e1a0fef9527696cc039165d729fb41b3f4f4f354c2dc35"
dependencies = [
"aead",
"chacha20",
"cipher",
"poly1305",
"zeroize",
]
[[package]]
name = "chrono"
version = "0.4.42"
......@@ -1156,6 +1462,17 @@ dependencies = [
"half 2.7.1",
]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common",
"inout",
"zeroize",
]
[[package]]
name = "clang-sys"
version = "1.8.1"
......@@ -1180,9 +1497,9 @@ dependencies = [
[[package]]
name = "clap"
version = "4.5.51"
version = "4.5.52"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c26d721170e0295f191a69bd9a1f93efcdb0aff38684b61ab5750468972e5f5"
checksum = "aa8120877db0e5c011242f96806ce3c94e0737ab8108532a76a3300a01db2ab8"
dependencies = [
"clap_builder",
"clap_derive",
......@@ -1190,9 +1507,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.5.51"
version = "4.5.52"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75835f0c7bf681bfd05abe44e965760fea999a5286c6eb2d59883634fd02011a"
checksum = "02576b399397b659c26064fbc92a75fede9d18ffd5f80ca1cd74ddab167016e1"
dependencies = [
"anstream",
"anstyle",
......@@ -1314,7 +1631,7 @@ checksum = "8030735ecb0d128428b64cd379809817e620a40e5001c54465b99ec5feec2857"
dependencies = [
"futures-core",
"prost 0.13.5",
"prost-types",
"prost-types 0.13.5",
"tonic 0.12.3",
"tracing-core",
]
......@@ -1333,7 +1650,7 @@ dependencies = [
"humantime",
"hyper-util",
"prost 0.13.5",
"prost-types",
"prost-types 0.13.5",
"serde",
"serde_json",
"thread_local",
......@@ -1371,6 +1688,12 @@ dependencies = [
"tiny-keccak",
]
[[package]]
name = "const-str"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f421161cb492475f1661ddc9815a745a1c894592070661180fdec3d4872e9c3"
[[package]]
name = "constant_time_eq"
version = "0.3.1"
......@@ -1424,8 +1747,17 @@ dependencies = [
]
[[package]]
name = "cpufeatures"
version = "0.2.17"
name = "core2"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505"
dependencies = [
"memchr",
]
[[package]]
name = "cpufeatures"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
dependencies = [
......@@ -1476,7 +1808,7 @@ dependencies = [
"anes",
"cast",
"ciborium",
"clap 4.5.51",
"clap 4.5.52",
"criterion-plot 0.5.0",
"futures",
"is-terminal",
......@@ -1515,6 +1847,12 @@ dependencies = [
"itertools 0.10.5",
]
[[package]]
name = "critical-section"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b"
[[package]]
name = "crossbeam"
version = "0.8.4"
......@@ -1609,6 +1947,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
dependencies = [
"generic-array",
"rand_core 0.6.4",
"typenum",
]
......@@ -1656,6 +1995,15 @@ dependencies = [
"memchr",
]
[[package]]
name = "ctr"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835"
dependencies = [
"cipher",
]
[[package]]
name = "cudarc"
version = "0.16.6"
......@@ -1689,6 +2037,7 @@ dependencies = [
"fiat-crypto",
"rustc_version",
"subtle",
"zeroize",
]
[[package]]
......@@ -1849,6 +2198,26 @@ version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476"
[[package]]
name = "data-encoding-macro"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47ce6c96ea0102f01122a185683611bd5ac8d99e62bc59dd12e6bda344ee673d"
dependencies = [
"data-encoding",
"data-encoding-macro-internal",
]
[[package]]
name = "data-encoding-macro-internal"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d162beedaa69905488a8da94f5ac3edb4dd4788b732fadb7bd120b2625c1976"
dependencies = [
"data-encoding",
"syn 2.0.110",
]
[[package]]
name = "defmac"
version = "0.1.3"
......@@ -1866,6 +2235,20 @@ dependencies = [
"zeroize",
]
[[package]]
name = "der-parser"
version = "10.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07da5016415d5a3c4dd39b11ed26f915f52fc4e0dc197d87908bc916e51bc1a6"
dependencies = [
"asn1-rs",
"displaydoc",
"nom 7.1.3",
"num-bigint",
"num-traits",
"rusticata-macros",
]
[[package]]
name = "deranged"
version = "0.5.5"
......@@ -2032,6 +2415,7 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
"subtle",
]
[[package]]
......@@ -2066,6 +2450,16 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "dispatch2"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89a09f22a6c6069a18470eb92d2298acf25463f14256d24778e1230d789a2aec"
dependencies = [
"bitflags 2.10.0",
"objc2",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
......@@ -2086,6 +2480,17 @@ dependencies = [
"const-random",
]
[[package]]
name = "docker_credential"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d89dfcba45b4afad7450a99b39e751590463e45c04728cf555d36bb66940de8"
dependencies = [
"base64 0.21.7",
"serde",
"serde_json",
]
[[package]]
name = "doctest-file"
version = "1.0.0"
......@@ -2190,6 +2595,48 @@ dependencies = [
"anyhow",
]
[[package]]
name = "dynamo-discovery"
version = "0.7.0"
dependencies = [
"anyhow",
"async-trait",
"axum 0.8.4",
"blake2",
"bytes",
"clap 4.5.52",
"dashmap 6.1.0",
"derive_builder",
"etcd-client",
"figment",
"futures",
"futures-util",
"hyper 1.8.1",
"libp2p",
"libp2p-identity",
"libp2p-kad",
"libp2p-mdns",
"libp2p-swarm",
"parking_lot",
"reqwest 0.12.24",
"serde",
"serde_json",
"tempfile",
"testcontainers",
"thiserror 2.0.17",
"tokio",
"tokio-stream",
"tokio-util",
"tonic 0.14.2",
"tower 0.5.2",
"tower-http",
"tracing",
"tracing-subscriber",
"uuid 1.18.1",
"validator",
"xxhash-rust",
]
[[package]]
name = "dynamo-engine-mistralrs"
version = "0.7.0"
......@@ -2262,7 +2709,7 @@ dependencies = [
"galil-seiferas",
"hf-hub",
"humantime",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-util",
"image",
"insta",
......@@ -2307,7 +2754,7 @@ dependencies = [
"toktrie 1.3.0",
"toktrie_hf_tokenizers 1.3.0",
"tonic 0.13.1",
"tonic-build",
"tonic-build 0.13.1",
"tower 0.5.2",
"tower-http",
"tracing",
......@@ -2363,7 +2810,7 @@ dependencies = [
"anyhow",
"async-stream",
"async-trait",
"clap 4.5.51",
"clap 4.5.52",
"dynamo-async-openai",
"dynamo-engine-mistralrs",
"dynamo-llm",
......@@ -2413,7 +2860,7 @@ dependencies = [
"figment",
"futures",
"humantime",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-util",
"inotify 0.11.0",
"jsonschema",
......@@ -2478,6 +2925,7 @@ version = "2.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
dependencies = [
"pkcs8",
"signature",
]
......@@ -2489,9 +2937,11 @@ checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9"
dependencies = [
"curve25519-dalek",
"ed25519",
"serde",
"sha2",
"signature",
"subtle",
"zeroize",
]
[[package]]
......@@ -2655,20 +3105,33 @@ dependencies = [
[[package]]
name = "etcd-client"
version = "0.16.1"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88365f1a5671eb2f7fc240adb216786bc6494b38ce15f1d26ad6eaa303d5e822"
checksum = "8acfe553027cd07fc5fafa81a84f19a7a87eaffaccd2162b6db05e8d6ce98084"
dependencies = [
"http 1.3.1",
"prost 0.13.5",
"prost 0.14.1",
"tokio",
"tokio-stream",
"tonic 0.13.1",
"tonic-build",
"tonic 0.14.2",
"tonic-build 0.14.2",
"tonic-prost",
"tonic-prost-build",
"tower 0.5.2",
"tower-service",
]
[[package]]
name = "etcetera"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26c7b13d0780cb82722fd59f6f57f925e143427e4a75313a6c77243bf5326ae6"
dependencies = [
"cfg-if 1.0.4",
"home",
"windows-sys 0.59.0",
]
[[package]]
name = "event-listener"
version = "5.4.1"
......@@ -2806,6 +3269,7 @@ dependencies = [
"pear",
"serde",
"serde_json",
"serde_yaml",
"tempfile",
"toml 0.8.23",
"uncased",
......@@ -2826,9 +3290,9 @@ dependencies = [
[[package]]
name = "find-msvc-tools"
version = "0.1.4"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127"
checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844"
[[package]]
name = "fixedbitset"
......@@ -2933,9 +3397,9 @@ dependencies = [
[[package]]
name = "fs-err"
version = "3.1.3"
version = "3.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ad492b2cf1d89d568a43508ab24f98501fe03f2f31c01e1d0fe7366a71745d2"
checksum = "62d91fd049c123429b018c47887d3f75a265540dd3c30ba9cb7bae9197edb03a"
dependencies = [
"autocfg",
"tokio",
......@@ -2997,6 +3461,16 @@ dependencies = [
"futures-util",
]
[[package]]
name = "futures-bounded"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91f328e7fb845fc832912fb6a34f40cf6d1888c92f974d1893a54e97b5ff542e"
dependencies = [
"futures-timer",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.31"
......@@ -3022,6 +3496,7 @@ dependencies = [
"futures-core",
"futures-task",
"futures-util",
"num_cpus",
]
[[package]]
......@@ -3030,6 +3505,16 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-lite"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad"
dependencies = [
"futures-core",
"pin-project-lite",
]
[[package]]
name = "futures-macro"
version = "0.3.31"
......@@ -3041,6 +3526,17 @@ dependencies = [
"syn 2.0.110",
]
[[package]]
name = "futures-rustls"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f2f12607f92c69b12ed746fabf9ca4f5c482cba46679c1a75b874ed7c26adb"
dependencies = [
"futures-io",
"rustls",
"rustls-pki-types",
]
[[package]]
name = "futures-sink"
version = "0.3.31"
......@@ -3379,11 +3875,21 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "ghash"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1"
dependencies = [
"opaque-debug",
"polyval",
]
[[package]]
name = "gif"
version = "0.13.3"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ae047235e33e2829703574b54fdec96bfbad892062d97fed2f76022287de61b"
checksum = "f954a9e9159ec994f73a30a12b96a702dde78f5547bcb561174597924f7d4162"
dependencies = [
"color_quant",
"weezl",
......@@ -3591,10 +4097,75 @@ dependencies = [
"serde_json",
"thiserror 2.0.17",
"tokio",
"ureq",
"ureq 2.12.1",
"windows-sys 0.60.2",
]
[[package]]
name = "hickory-proto"
version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8a6fe56c0038198998a6f217ca4e7ef3a5e51f46163bd6dd60b5c71ca6c6502"
dependencies = [
"async-trait",
"cfg-if 1.0.4",
"data-encoding",
"enum-as-inner",
"futures-channel",
"futures-io",
"futures-util",
"idna",
"ipnet",
"once_cell",
"rand 0.9.2",
"ring",
"socket2 0.5.10",
"thiserror 2.0.17",
"tinyvec",
"tokio",
"tracing",
"url",
]
[[package]]
name = "hickory-resolver"
version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc62a9a99b0bfb44d2ab95a7208ac952d31060efc16241c87eaf36406fecf87a"
dependencies = [
"cfg-if 1.0.4",
"futures-util",
"hickory-proto",
"ipconfig",
"moka",
"once_cell",
"parking_lot",
"rand 0.9.2",
"resolv-conf",
"smallvec",
"thiserror 2.0.17",
"tokio",
"tracing",
]
[[package]]
name = "hkdf"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7"
dependencies = [
"hmac",
]
[[package]]
name = "hmac"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
dependencies = [
"digest",
]
[[package]]
name = "home"
version = "0.5.12"
......@@ -3757,9 +4328,9 @@ dependencies = [
[[package]]
name = "hyper"
version = "1.7.0"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e"
checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11"
dependencies = [
"atomic-waker",
"bytes",
......@@ -3778,6 +4349,21 @@ dependencies = [
"want",
]
[[package]]
name = "hyper-named-pipe"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278"
dependencies = [
"hex",
"hyper 1.8.1",
"hyper-util",
"pin-project-lite",
"tokio",
"tower-service",
"winapi 0.3.9",
]
[[package]]
name = "hyper-rustls"
version = "0.27.7"
......@@ -3785,7 +4371,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
dependencies = [
"http 1.3.1",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-util",
"log",
"rustls",
......@@ -3803,7 +4389,7 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
dependencies = [
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-util",
"pin-project-lite",
"tokio",
......@@ -3818,7 +4404,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
dependencies = [
"bytes",
"http-body-util",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-util",
"native-tls",
"tokio",
......@@ -3828,9 +4414,9 @@ dependencies = [
[[package]]
name = "hyper-util"
version = "0.1.17"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8"
checksum = "52e9a2a24dc5c6821e71a7030e1e14b7b632acac55c40e9d2e082c621261bb56"
dependencies = [
"base64 0.22.1",
"bytes",
......@@ -3839,7 +4425,7 @@ dependencies = [
"futures-util",
"http 1.3.1",
"http-body 1.0.1",
"hyper 1.7.0",
"hyper 1.8.1",
"ipnet",
"libc",
"percent-encoding",
......@@ -3852,6 +4438,21 @@ dependencies = [
"windows-registry",
]
[[package]]
name = "hyperlocal"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7"
dependencies = [
"hex",
"http-body-util",
"hyper 1.8.1",
"hyper-util",
"pin-project-lite",
"tokio",
"tower-service",
]
[[package]]
name = "iana-time-zone"
version = "0.1.64"
......@@ -3985,41 +4586,95 @@ dependencies = [
]
[[package]]
name = "image"
version = "0.25.8"
name = "if-addrs"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "529feb3e6769d234375c4cf1ee2ce713682b8e76538cb13f9fc23e1400a591e7"
checksum = "cabb0019d51a643781ff15c9c8a3e5dedc365c47211270f4e8f82812fedd8f0a"
dependencies = [
"bytemuck",
"byteorder-lite",
"color_quant",
"exr",
"gif",
"image-webp",
"moxcms",
"num-traits",
"png",
"qoi",
"ravif",
"rayon",
"rgb",
"tiff",
"zune-core",
"zune-jpeg",
"libc",
"windows-sys 0.48.0",
]
[[package]]
name = "image-webp"
version = "0.2.4"
name = "if-watch"
version = "3.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3"
checksum = "cdf9d64cfcf380606e64f9a0bcf493616b65331199f984151a6fa11a7b3cde38"
dependencies = [
"byteorder-lite",
"quick-error 2.0.1",
"async-io",
"core-foundation 0.9.4",
"fnv",
"futures",
"if-addrs",
"ipnet",
"log",
"netlink-packet-core",
"netlink-packet-route",
"netlink-proto",
"netlink-sys",
"rtnetlink",
"system-configuration 0.6.1",
"tokio",
"windows 0.53.0",
]
[[package]]
name = "imgref"
name = "igd-next"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "516893339c97f6011282d5825ac94fc1c7aad5cad26bdc2d0cee068c0bf97f97"
dependencies = [
"async-trait",
"attohttpc",
"bytes",
"futures",
"http 1.3.1",
"http-body-util",
"hyper 1.8.1",
"hyper-util",
"log",
"rand 0.9.2",
"tokio",
"url",
"xmltree",
]
[[package]]
name = "image"
version = "0.25.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6506c6c10786659413faa717ceebcb8f70731c0a60cbae39795fdf114519c1a"
dependencies = [
"bytemuck",
"byteorder-lite",
"color_quant",
"exr",
"gif",
"image-webp",
"moxcms",
"num-traits",
"png",
"qoi",
"ravif",
"rayon",
"rgb",
"tiff",
"zune-core 0.5.0",
"zune-jpeg 0.5.5",
]
[[package]]
name = "image-webp"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3"
dependencies = [
"byteorder-lite",
"quick-error 2.0.1",
]
[[package]]
name = "imgref"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8"
......@@ -4100,6 +4755,15 @@ dependencies = [
"libc",
]
[[package]]
name = "inout"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
dependencies = [
"generic-array",
]
[[package]]
name = "insta"
version = "1.43.2"
......@@ -4159,6 +4823,18 @@ dependencies = [
"libc",
]
[[package]]
name = "ipconfig"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b58db92f96b720de98181bbbe63c831e87005ab460c1bf306eb2622b4707997f"
dependencies = [
"socket2 0.5.10",
"widestring",
"windows-sys 0.48.0",
"winreg",
]
[[package]]
name = "ipnet"
version = "2.11.0"
......@@ -4231,15 +4907,6 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.13.0"
......@@ -4391,7 +5058,7 @@ dependencies = [
"anyhow",
"base64 0.21.7",
"bytecount",
"clap 4.5.51",
"clap 4.5.52",
"fancy-regex 0.11.0",
"fraction",
"getrandom 0.2.16",
......@@ -4433,6 +5100,15 @@ dependencies = [
"serde_json",
]
[[package]]
name = "keccak"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ecc2af9a1119c51f12a14607e783cb977bde58bc069ff0c3da1095e635d70654"
dependencies = [
"cpufeatures",
]
[[package]]
name = "kernel32-sys"
version = "0.2.2"
......@@ -4491,7 +5167,7 @@ dependencies = [
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-rustls",
"hyper-timeout",
"hyper-util",
......@@ -4643,6 +5319,320 @@ version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de"
[[package]]
name = "libp2p"
version = "0.56.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce71348bf5838e46449ae240631117b487073d5f347c06d434caddcb91dceb5a"
dependencies = [
"bytes",
"either",
"futures",
"futures-timer",
"getrandom 0.2.16",
"libp2p-allow-block-list",
"libp2p-connection-limits",
"libp2p-core",
"libp2p-dns",
"libp2p-identity",
"libp2p-mdns",
"libp2p-noise",
"libp2p-pnet",
"libp2p-quic",
"libp2p-swarm",
"libp2p-tcp",
"libp2p-upnp",
"libp2p-yamux",
"multiaddr",
"pin-project",
"rw-stream-sink",
"thiserror 2.0.17",
]
[[package]]
name = "libp2p-allow-block-list"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d16ccf824ee859ca83df301e1c0205270206223fd4b1f2e512a693e1912a8f4a"
dependencies = [
"libp2p-core",
"libp2p-identity",
"libp2p-swarm",
]
[[package]]
name = "libp2p-connection-limits"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a18b8b607cf3bfa2f8c57db9c7d8569a315d5cc0a282e6bfd5ebfc0a9840b2a0"
dependencies = [
"libp2p-core",
"libp2p-identity",
"libp2p-swarm",
]
[[package]]
name = "libp2p-core"
version = "0.43.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d28e2d2def7c344170f5c6450c0dbe3dfef655610dbfde2f6ac28a527abbe36"
dependencies = [
"either",
"fnv",
"futures",
"futures-timer",
"libp2p-identity",
"multiaddr",
"multihash",
"multistream-select",
"parking_lot",
"pin-project",
"quick-protobuf",
"rand 0.8.5",
"rw-stream-sink",
"thiserror 2.0.17",
"tracing",
"unsigned-varint 0.8.0",
"web-time",
]
[[package]]
name = "libp2p-dns"
version = "0.44.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b770c1c8476736ca98c578cba4b505104ff8e842c2876b528925f9766379f9a"
dependencies = [
"async-trait",
"futures",
"hickory-resolver",
"libp2p-core",
"libp2p-identity",
"parking_lot",
"smallvec",
"tracing",
]
[[package]]
name = "libp2p-identity"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3104e13b51e4711ff5738caa1fb54467c8604c2e94d607e27745bcf709068774"
dependencies = [
"bs58",
"ed25519-dalek",
"hkdf",
"multihash",
"quick-protobuf",
"rand 0.8.5",
"sha2",
"thiserror 2.0.17",
"tracing",
"zeroize",
]
[[package]]
name = "libp2p-kad"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13d3fd632a5872ec804d37e7413ceea20588f69d027a0fa3c46f82574f4dee60"
dependencies = [
"asynchronous-codec",
"bytes",
"either",
"fnv",
"futures",
"futures-bounded",
"futures-timer",
"libp2p-core",
"libp2p-identity",
"libp2p-swarm",
"quick-protobuf",
"quick-protobuf-codec",
"rand 0.8.5",
"sha2",
"smallvec",
"thiserror 2.0.17",
"tracing",
"uint",
"web-time",
]
[[package]]
name = "libp2p-mdns"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c66872d0f1ffcded2788683f76931be1c52e27f343edb93bc6d0bcd8887be443"
dependencies = [
"futures",
"hickory-proto",
"if-watch",
"libp2p-core",
"libp2p-identity",
"libp2p-swarm",
"rand 0.8.5",
"smallvec",
"socket2 0.5.10",
"tokio",
"tracing",
]
[[package]]
name = "libp2p-noise"
version = "0.46.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc73eacbe6462a0eb92a6527cac6e63f02026e5407f8831bde8293f19217bfbf"
dependencies = [
"asynchronous-codec",
"bytes",
"futures",
"libp2p-core",
"libp2p-identity",
"multiaddr",
"multihash",
"quick-protobuf",
"rand 0.8.5",
"snow",
"static_assertions",
"thiserror 2.0.17",
"tracing",
"x25519-dalek",
"zeroize",
]
[[package]]
name = "libp2p-pnet"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf240b834dfa3f8b48feb2c4b87bb2cf82751543001b6ee86077f48183b18d52"
dependencies = [
"futures",
"pin-project",
"rand 0.8.5",
"salsa20",
"sha3",
"tracing",
]
[[package]]
name = "libp2p-quic"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8dc448b2de9f4745784e3751fe8bc6c473d01b8317edd5ababcb0dec803d843f"
dependencies = [
"futures",
"futures-timer",
"if-watch",
"libp2p-core",
"libp2p-identity",
"libp2p-tls",
"quinn",
"rand 0.8.5",
"ring",
"rustls",
"socket2 0.5.10",
"thiserror 2.0.17",
"tokio",
"tracing",
]
[[package]]
name = "libp2p-swarm"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6aa762e5215919a34e31c35d4b18bf2e18566ecab7f8a3d39535f4a3068f8b62"
dependencies = [
"either",
"fnv",
"futures",
"futures-timer",
"libp2p-core",
"libp2p-identity",
"libp2p-swarm-derive",
"lru",
"multistream-select",
"rand 0.8.5",
"smallvec",
"tokio",
"tracing",
"web-time",
]
[[package]]
name = "libp2p-swarm-derive"
version = "0.35.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd297cf53f0cb3dee4d2620bb319ae47ef27c702684309f682bdb7e55a18ae9c"
dependencies = [
"heck 0.5.0",
"quote",
"syn 2.0.110",
]
[[package]]
name = "libp2p-tcp"
version = "0.44.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65b4e030c52c46c8d01559b2b8ca9b7c4185f10576016853129ca1fe5cd1a644"
dependencies = [
"futures",
"futures-timer",
"if-watch",
"libc",
"libp2p-core",
"socket2 0.5.10",
"tokio",
"tracing",
]
[[package]]
name = "libp2p-tls"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96ff65a82e35375cbc31ebb99cacbbf28cb6c4fefe26bf13756ddcf708d40080"
dependencies = [
"futures",
"futures-rustls",
"libp2p-core",
"libp2p-identity",
"rcgen",
"ring",
"rustls",
"rustls-webpki 0.103.8",
"thiserror 2.0.17",
"x509-parser",
"yasna",
]
[[package]]
name = "libp2p-upnp"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4757e65fe69399c1a243bbb90ec1ae5a2114b907467bf09f3575e899815bb8d3"
dependencies = [
"futures",
"futures-timer",
"igd-next",
"libp2p-core",
"libp2p-swarm",
"tokio",
"tracing",
]
[[package]]
name = "libp2p-yamux"
version = "0.47.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f15df094914eb4af272acf9adaa9e287baa269943f32ea348ba29cfb9bfc60d8"
dependencies = [
"either",
"futures",
"libp2p-core",
"thiserror 2.0.17",
"tracing",
"yamux 0.12.1",
"yamux 0.13.8",
]
[[package]]
name = "libredox"
version = "0.1.10"
......@@ -4738,6 +5728,15 @@ dependencies = [
"vob",
]
[[package]]
name = "lru"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38"
dependencies = [
"hashbrown 0.15.5",
]
[[package]]
name = "lru-slab"
version = "0.1.2"
......@@ -4858,6 +5857,17 @@ dependencies = [
"web_atoms",
]
[[package]]
name = "match-lookup"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1265724d8cb29dbbc2b0f06fffb8bf1a8c0cf73a78eede9ba73a4a66c52a981e"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "match_token"
version = "0.1.0"
......@@ -5093,7 +6103,7 @@ dependencies = [
"anyhow",
"candle-core 0.9.1 (git+https://github.com/EricLBuehler/candle.git?rev=7511e510)",
"candle-nn",
"clap 4.5.51",
"clap 4.5.52",
"either",
"futures",
"image",
......@@ -5138,7 +6148,7 @@ dependencies = [
"candle-nn",
"cfgrammar",
"chrono",
"clap 4.5.51",
"clap 4.5.52",
"csv",
"derive-new",
"derive_more 2.0.1",
......@@ -5300,7 +6310,7 @@ dependencies = [
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-util",
"log",
"rand 0.9.2",
......@@ -5318,7 +6328,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9462ec6cd8b3d6beaa262ad0907a8ba297c7e3a220c11e4159742d8c275588eb"
dependencies = [
"anyhow",
"clap 4.5.51",
"clap 4.5.52",
"colored",
"futures",
"modelexpress-common",
......@@ -5342,7 +6352,7 @@ dependencies = [
"anyhow",
"async-trait",
"chrono",
"clap 4.5.51",
"clap 4.5.52",
"config",
"hf-hub",
"jiff",
......@@ -5353,10 +6363,28 @@ dependencies = [
"thiserror 2.0.17",
"tokio",
"tonic 0.13.1",
"tonic-build",
"tonic-build 0.13.1",
"tracing",
]
[[package]]
name = "moka"
version = "0.12.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8261cd88c312e0004c1d51baad2980c66528dfdb2bee62003e643a4d8f86b077"
dependencies = [
"crossbeam-channel",
"crossbeam-epoch",
"crossbeam-utils",
"equivalent",
"parking_lot",
"portable-atomic",
"rustc_version",
"smallvec",
"tagptr",
"uuid 1.18.1",
]
[[package]]
name = "monostate"
version = "0.1.18"
......@@ -5389,12 +6417,67 @@ dependencies = [
"pxfm",
]
[[package]]
name = "multiaddr"
version = "0.18.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe6351f60b488e04c1d21bc69e56b89cb3f5e8f5d22557d6e8031bdfd79b6961"
dependencies = [
"arrayref",
"byteorder",
"data-encoding",
"libp2p-identity",
"multibase",
"multihash",
"percent-encoding",
"serde",
"static_assertions",
"unsigned-varint 0.8.0",
"url",
]
[[package]]
name = "multibase"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8694bb4835f452b0e3bb06dbebb1d6fc5385b6ca1caf2e55fd165c042390ec77"
dependencies = [
"base-x",
"base256emoji",
"data-encoding",
"data-encoding-macro",
]
[[package]]
name = "multihash"
version = "0.19.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b430e7953c29dd6a09afc29ff0bb69c6e306329ee6794700aee27b76a1aea8d"
dependencies = [
"core2",
"unsigned-varint 0.8.0",
]
[[package]]
name = "multimap"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084"
[[package]]
name = "multistream-select"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea0df8e5eec2298a62b326ee4f0d7fe1a6b90a09dfcf9df37b38f947a8c42f19"
dependencies = [
"bytes",
"futures",
"log",
"pin-project",
"smallvec",
"unsigned-varint 0.7.2",
]
[[package]]
name = "nalgebra"
version = "0.33.2"
......@@ -5480,6 +6563,70 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "netlink-packet-core"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72724faf704479d67b388da142b186f916188505e7e0b26719019c525882eda4"
dependencies = [
"anyhow",
"byteorder",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-route"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "053998cea5a306971f88580d0829e90f270f940befd7cf928da179d4187a5a66"
dependencies = [
"anyhow",
"bitflags 1.3.2",
"byteorder",
"libc",
"netlink-packet-core",
"netlink-packet-utils",
]
[[package]]
name = "netlink-packet-utils"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ede8a08c71ad5a95cdd0e4e52facd37190977039a4704eb82a283f713747d34"
dependencies = [
"anyhow",
"byteorder",
"paste",
"thiserror 1.0.69",
]
[[package]]
name = "netlink-proto"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72452e012c2f8d612410d89eea01e2d9b56205274abb35d53f60200b2ec41d60"
dependencies = [
"bytes",
"futures",
"log",
"netlink-packet-core",
"netlink-sys",
"thiserror 2.0.17",
]
[[package]]
name = "netlink-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16c903aa70590cb93691bf97a767c8d1d6122d2cc9070433deb3bbf36ce8bd23"
dependencies = [
"bytes",
"futures",
"libc",
"log",
"tokio",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
......@@ -5574,6 +6721,12 @@ dependencies = [
"signatory",
]
[[package]]
name = "nohash-hasher"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451"
[[package]]
name = "nom"
version = "7.1.3"
......@@ -5739,70 +6892,229 @@ dependencies = [
]
[[package]]
name = "num-traits"
version = "0.2.19"
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
"libm",
]
[[package]]
name = "num_cpus"
version = "1.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
dependencies = [
"hermit-abi 0.5.2",
"libc",
]
[[package]]
name = "num_enum"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c"
dependencies = [
"num_enum_derive",
"rustversion",
]
[[package]]
name = "num_enum_derive"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7"
dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn 2.0.110",
]
[[package]]
name = "num_threads"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9"
dependencies = [
"libc",
]
[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "objc"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1"
dependencies = [
"malloc_buf",
"objc_exception",
]
[[package]]
name = "objc2"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c2599ce0ec54857b29ce62166b0ed9b4f6f1a70ccc9a71165b6154caca8c05"
dependencies = [
"objc2-encode",
]
[[package]]
name = "objc2-cloud-kit"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73ad74d880bb43877038da939b7427bba67e9dd42004a18b809ba7d87cee241c"
dependencies = [
"bitflags 2.10.0",
"objc2",
"objc2-foundation",
]
[[package]]
name = "objc2-core-data"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b402a653efbb5e82ce4df10683b6b28027616a2715e90009947d50b8dd298fa"
dependencies = [
"objc2",
"objc2-foundation",
]
[[package]]
name = "objc2-core-foundation"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536"
dependencies = [
"bitflags 2.10.0",
"dispatch2",
"objc2",
]
[[package]]
name = "objc2-core-graphics"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e022c9d066895efa1345f8e33e584b9f958da2fd4cd116792e15e07e4720a807"
dependencies = [
"bitflags 2.10.0",
"dispatch2",
"objc2",
"objc2-core-foundation",
"objc2-io-surface",
]
[[package]]
name = "objc2-core-image"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5d563b38d2b97209f8e861173de434bd0214cf020e3423a52624cd1d989f006"
dependencies = [
"objc2",
"objc2-foundation",
]
[[package]]
name = "objc2-core-location"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
checksum = "ca347214e24bc973fc025fd0d36ebb179ff30536ed1f80252706db19ee452009"
dependencies = [
"autocfg",
"libm",
"objc2",
"objc2-foundation",
]
[[package]]
name = "num_cpus"
version = "1.17.0"
name = "objc2-core-text"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
checksum = "0cde0dfb48d25d2b4862161a4d5fcc0e3c24367869ad306b0c9ec0073bfed92d"
dependencies = [
"hermit-abi 0.5.2",
"libc",
"bitflags 2.10.0",
"objc2",
"objc2-core-foundation",
"objc2-core-graphics",
]
[[package]]
name = "num_enum"
version = "0.7.5"
name = "objc2-encode"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c"
dependencies = [
"num_enum_derive",
"rustversion",
]
checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33"
[[package]]
name = "num_enum_derive"
version = "0.7.5"
name = "objc2-foundation"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7"
checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272"
dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn 2.0.110",
"bitflags 2.10.0",
"block2",
"libc",
"objc2",
"objc2-core-foundation",
]
[[package]]
name = "num_threads"
version = "0.1.7"
name = "objc2-io-surface"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9"
checksum = "180788110936d59bab6bd83b6060ffdfffb3b922ba1396b312ae795e1de9d81d"
dependencies = [
"libc",
"bitflags 2.10.0",
"objc2",
"objc2-core-foundation",
]
[[package]]
name = "number_prefix"
version = "0.4.0"
name = "objc2-quartz-core"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
checksum = "96c1358452b371bf9f104e21ec536d37a650eb10f7ee379fff67d2e08d537f1f"
dependencies = [
"bitflags 2.10.0",
"objc2",
"objc2-core-foundation",
"objc2-foundation",
]
[[package]]
name = "objc"
version = "0.2.7"
name = "objc2-ui-kit"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "915b1b472bc21c53464d6c8461c9d3af805ba1ef837e1cac254428f4a77177b1"
checksum = "d87d638e33c06f577498cbcc50491496a3ed4246998a7fbba7ccb98b1e7eab22"
dependencies = [
"malloc_buf",
"objc_exception",
"bitflags 2.10.0",
"block2",
"objc2",
"objc2-cloud-kit",
"objc2-core-data",
"objc2-core-foundation",
"objc2-core-graphics",
"objc2-core-image",
"objc2-core-location",
"objc2-core-text",
"objc2-foundation",
"objc2-quartz-core",
"objc2-user-notifications",
]
[[package]]
name = "objc2-user-notifications"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9df9128cbbfef73cda168416ccf7f837b62737d748333bfe9ab71c245d76613e"
dependencies = [
"objc2",
"objc2-foundation",
]
[[package]]
......@@ -5833,11 +7145,24 @@ dependencies = [
"nonmax",
]
[[package]]
name = "oid-registry"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12f40cff3dde1b6087cc5d5f5d4d65712f34016a03ed60e9c08dcc392736b5b7"
dependencies = [
"asn1-rs",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
dependencies = [
"critical-section",
"portable-atomic",
]
[[package]]
name = "once_cell_polyfill"
......@@ -5879,6 +7204,12 @@ version = "11.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
[[package]]
name = "opaque-debug"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
[[package]]
name = "openai-harmony"
version = "0.0.3"
......@@ -5888,7 +7219,7 @@ dependencies = [
"anyhow",
"base64 0.22.1",
"bstr",
"clap 4.5.51",
"clap 4.5.52",
"fancy-regex 0.13.0",
"futures",
"image",
......@@ -6069,14 +7400,18 @@ dependencies = [
[[package]]
name = "os_info"
version = "3.12.0"
version = "3.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0e1ac5fde8d43c34139135df8ea9ee9465394b2d8d20f032d38998f64afffc3"
checksum = "7c39b5918402d564846d5aba164c09a66cc88d232179dfd3e3c619a25a268392"
dependencies = [
"android_system_properties",
"log",
"plist",
"nix 0.30.1",
"objc2",
"objc2-foundation",
"objc2-ui-kit",
"serde",
"windows-sys 0.52.0",
"windows-sys 0.61.2",
]
[[package]]
......@@ -6118,12 +7453,43 @@ dependencies = [
"windows-link 0.2.1",
]
[[package]]
name = "parse-display"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "914a1c2265c98e2446911282c6ac86d8524f495792c38c5bd884f80499c7538a"
dependencies = [
"parse-display-derive",
"regex",
"regex-syntax",
]
[[package]]
name = "parse-display-derive"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ae7800a4c974efd12df917266338e79a7a74415173caf7e70aa0a0707345281"
dependencies = [
"proc-macro2",
"quote",
"regex",
"regex-syntax",
"structmeta",
"syn 2.0.110",
]
[[package]]
name = "paste"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
[[package]]
name = "pastey"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec"
[[package]]
name = "pathdiff"
version = "0.2.3"
......@@ -6331,19 +7697,6 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plist"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "740ebea15c5d1428f910cd1a5f52cebf8d25006245ed8ade92702f4943d91e07"
dependencies = [
"base64 0.22.1",
"indexmap 2.12.0",
"quick-xml",
"serde",
"time",
]
[[package]]
name = "plotters"
version = "0.3.7"
......@@ -6385,6 +7738,43 @@ dependencies = [
"miniz_oxide",
]
[[package]]
name = "polling"
version = "3.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218"
dependencies = [
"cfg-if 1.0.4",
"concurrent-queue",
"hermit-abi 0.5.2",
"pin-project-lite",
"rustix",
"windows-sys 0.61.2",
]
[[package]]
name = "poly1305"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8159bd90725d2df49889a078b54f4f79e87f1f8a8444194cdca81d38f5393abf"
dependencies = [
"cpufeatures",
"opaque-debug",
"universal-hash",
]
[[package]]
name = "polyval"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25"
dependencies = [
"cfg-if 1.0.4",
"cpufeatures",
"opaque-debug",
"universal-hash",
]
[[package]]
name = "portable-atomic"
version = "1.11.1"
......@@ -6589,7 +7979,29 @@ dependencies = [
"petgraph",
"prettyplease",
"prost 0.13.5",
"prost-types",
"prost-types 0.13.5",
"regex",
"syn 2.0.110",
"tempfile",
]
[[package]]
name = "prost-build"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac6c3320f9abac597dcbc668774ef006702672474aad53c6d596b62e487b40b1"
dependencies = [
"heck 0.5.0",
"itertools 0.14.0",
"log",
"multimap",
"once_cell",
"petgraph",
"prettyplease",
"prost 0.14.1",
"prost-types 0.14.1",
"pulldown-cmark",
"pulldown-cmark-to-cmark",
"regex",
"syn 2.0.110",
"tempfile",
......@@ -6630,6 +8042,15 @@ dependencies = [
"prost 0.13.5",
]
[[package]]
name = "prost-types"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9b4db3d6da204ed77bb26ba83b6122a73aeb2e87e25fbf7ad2e84c4ccbf8f72"
dependencies = [
"prost 0.14.1",
]
[[package]]
name = "protobuf"
version = "3.7.2"
......@@ -6650,6 +8071,26 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "pulldown-cmark"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0"
dependencies = [
"bitflags 2.10.0",
"memchr",
"unicase",
]
[[package]]
name = "pulldown-cmark-to-cmark"
version = "21.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8246feae3db61428fd0bb94285c690b460e4517d83152377543ca802357785f1"
dependencies = [
"pulldown-cmark",
]
[[package]]
name = "pulp"
version = "0.18.22"
......@@ -6707,12 +8148,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
[[package]]
name = "quick-xml"
version = "0.38.4"
name = "quick-protobuf"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
checksum = "9d6da84cc204722a989e01ba2f6e1e276e190f22263d0cb6ce8526fcdb0d2e1f"
dependencies = [
"memchr",
"byteorder",
]
[[package]]
name = "quick-protobuf-codec"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15a0580ab32b169745d7a39db2ba969226ca16738931be152a3209b409de2474"
dependencies = [
"asynchronous-codec",
"bytes",
"quick-protobuf",
"thiserror 1.0.69",
"unsigned-varint 0.8.0",
]
[[package]]
......@@ -6723,6 +8177,7 @@ checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
dependencies = [
"bytes",
"cfg_aliases",
"futures-io",
"pin-project-lite",
"quinn-proto",
"quinn-udp",
......@@ -6894,19 +8349,21 @@ dependencies = [
[[package]]
name = "rav1e"
version = "0.7.1"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd87ce80a7665b1cce111f8a16c1f3929f6547ce91ade6addf4ec86a8dda5ce9"
checksum = "43b6dd56e85d9483277cde964fd1bdb0428de4fec5ebba7540995639a21cb32b"
dependencies = [
"aligned-vec",
"arbitrary",
"arg_enum_proc_macro",
"arrayvec",
"av-scenechange",
"av1-grain",
"bitstream-io",
"built",
"cfg-if 1.0.4",
"interpolate_name",
"itertools 0.12.1",
"itertools 0.14.0",
"libc",
"libfuzzer-sys",
"log",
......@@ -6915,23 +8372,21 @@ dependencies = [
"noop_proc_macro",
"num-derive",
"num-traits",
"once_cell",
"paste",
"profiling",
"rand 0.8.5",
"rand_chacha 0.3.1",
"rand 0.9.2",
"rand_chacha 0.9.0",
"simd_helpers",
"system-deps",
"thiserror 1.0.69",
"thiserror 2.0.17",
"v_frame",
"wasm-bindgen",
]
[[package]]
name = "ravif"
version = "0.11.20"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5825c26fddd16ab9f515930d49028a630efec172e903483c94796cfe31893e6b"
checksum = "ef69c1990ceef18a116855938e74793a5f7496ee907562bd0857b6ac734ab285"
dependencies = [
"avif-serialize",
"imgref",
......@@ -6997,6 +8452,19 @@ dependencies = [
"crossbeam-utils",
]
[[package]]
name = "rcgen"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75e669e5202259b5314d1ea5397316ad400819437857b90861765f24c4cf80a2"
dependencies = [
"pem",
"ring",
"rustls-pki-types",
"time",
"yasna",
]
[[package]]
name = "realfft"
version = "3.5.0"
......@@ -7145,7 +8613,7 @@ dependencies = [
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-rustls",
"hyper-tls",
"hyper-util",
......@@ -7195,6 +8663,12 @@ dependencies = [
"thiserror 1.0.69",
]
[[package]]
name = "resolv-conf"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b3789b30bd25ba102de4beabd95d21ac45b69b1be7d14522bab988c526d6799"
[[package]]
name = "rgb"
version = "0.8.52"
......@@ -7321,6 +8795,24 @@ dependencies = [
"syn 2.0.110",
]
[[package]]
name = "rtnetlink"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a552eb82d19f38c3beed3f786bd23aa434ceb9ac43ab44419ca6d67a7e186c0"
dependencies = [
"futures",
"log",
"netlink-packet-core",
"netlink-packet-route",
"netlink-packet-utils",
"netlink-proto",
"netlink-sys",
"nix 0.26.4",
"thiserror 1.0.69",
"tokio",
]
[[package]]
name = "rubato"
version = "0.16.2"
......@@ -7438,6 +8930,15 @@ dependencies = [
"transpose",
]
[[package]]
name = "rusticata-macros"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632"
dependencies = [
"nom 7.1.3",
]
[[package]]
name = "rustix"
version = "1.1.2"
......@@ -7608,6 +9109,17 @@ dependencies = [
"wait-timeout",
]
[[package]]
name = "rw-stream-sink"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8c9026ff5d2f23da5e45bbc283f156383001bfb09c4e44256d02c1a685fe9a1"
dependencies = [
"futures",
"pin-project",
"static_assertions",
]
[[package]]
name = "ryu"
version = "1.0.20"
......@@ -7643,6 +9155,15 @@ dependencies = [
"serde_json",
]
[[package]]
name = "salsa20"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213"
dependencies = [
"cipher",
]
[[package]]
name = "same-file"
version = "1.0.6"
......@@ -8009,9 +9530,9 @@ dependencies = [
[[package]]
name = "serde_with"
version = "3.15.1"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa66c845eee442168b2c8134fec70ac50dc20e760769c8ba0ad1319ca1959b04"
checksum = "10574371d41b0d9b2cff89418eda27da52bcaff2cc8741db26382a77c29131f1"
dependencies = [
"base64 0.22.1",
"chrono",
......@@ -8028,9 +9549,9 @@ dependencies = [
[[package]]
name = "serde_with_macros"
version = "3.15.1"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b91a903660542fced4e99881aa481bdbaec1634568ee02e0b8bd57c64cb38955"
checksum = "08a72d8216842fdd57820dc78d840bef99248e35fb2554ff923319e60f2d686b"
dependencies = [
"darling 0.21.3",
"proc-macro2",
......@@ -8107,6 +9628,16 @@ dependencies = [
"digest",
]
[[package]]
name = "sha3"
version = "0.10.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60"
dependencies = [
"digest",
"keccak",
]
[[package]]
name = "sharded-slab"
version = "0.1.7"
......@@ -8232,6 +9763,23 @@ version = "1.15.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
[[package]]
name = "snow"
version = "0.9.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "850948bee068e713b8ab860fe1adc4d109676ab4c3b621fd8147f06b261f2f85"
dependencies = [
"aes-gcm",
"blake2",
"chacha20poly1305",
"curve25519-dalek",
"rand_core 0.6.4",
"ring",
"rustc_version",
"sha2",
"subtle",
]
[[package]]
name = "socket2"
version = "0.5.10"
......@@ -8383,6 +9931,29 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "structmeta"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e1575d8d40908d70f6fd05537266b90ae71b15dbbe7a8b7dffa2b759306d329"
dependencies = [
"proc-macro2",
"quote",
"structmeta-derive",
"syn 2.0.110",
]
[[package]]
name = "structmeta-derive"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.110",
]
[[package]]
name = "strum"
version = "0.27.2"
......@@ -8633,7 +10204,7 @@ dependencies = [
"ntapi",
"once_cell",
"rayon",
"windows",
"windows 0.52.0",
]
[[package]]
......@@ -8691,6 +10262,12 @@ dependencies = [
"version-compare",
]
[[package]]
name = "tagptr"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
[[package]]
name = "target-lexicon"
version = "0.12.16"
......@@ -8741,6 +10318,35 @@ dependencies = [
"windows-sys 0.60.2",
]
[[package]]
name = "testcontainers"
version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f3ac71069f20ecfa60c396316c283fbf35e6833a53dff551a31b5458da05edc"
dependencies = [
"astral-tokio-tar",
"async-trait",
"bollard",
"bytes",
"docker_credential",
"either",
"etcetera",
"futures",
"log",
"memchr",
"parse-display",
"pin-project-lite",
"serde",
"serde_json",
"serde_with",
"thiserror 2.0.17",
"tokio",
"tokio-stream",
"tokio-util",
"ulid",
"url",
]
[[package]]
name = "textwrap"
version = "0.11.0"
......@@ -8810,7 +10416,7 @@ dependencies = [
"half 2.7.1",
"quick-error 2.0.1",
"weezl",
"zune-jpeg",
"zune-jpeg 0.4.21",
]
[[package]]
......@@ -9233,7 +10839,7 @@ dependencies = [
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-timeout",
"hyper-util",
"percent-encoding",
......@@ -9262,7 +10868,7 @@ dependencies = [
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-timeout",
"hyper-util",
"percent-encoding",
......@@ -9270,7 +10876,6 @@ dependencies = [
"prost 0.13.5",
"socket2 0.5.10",
"tokio",
"tokio-rustls",
"tokio-stream",
"tower 0.5.2",
"tower-layer",
......@@ -9285,18 +10890,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203"
dependencies = [
"async-trait",
"axum 0.8.4",
"base64 0.22.1",
"bytes",
"h2 0.4.12",
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
"hyper 1.7.0",
"hyper 1.8.1",
"hyper-timeout",
"hyper-util",
"percent-encoding",
"pin-project",
"socket2 0.6.1",
"sync_wrapper 1.0.2",
"tokio",
"tokio-rustls",
"tokio-stream",
"tower 0.5.2",
"tower-layer",
......@@ -9312,8 +10921,20 @@ checksum = "eac6f67be712d12f0b41328db3137e0d0757645d8904b4cb7d51cd9c2279e847"
dependencies = [
"prettyplease",
"proc-macro2",
"prost-build",
"prost-types",
"prost-build 0.13.5",
"prost-types 0.13.5",
"quote",
"syn 2.0.110",
]
[[package]]
name = "tonic-build"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c40aaccc9f9eccf2cd82ebc111adc13030d23e887244bc9cfa5d1d636049de3"
dependencies = [
"prettyplease",
"proc-macro2",
"quote",
"syn 2.0.110",
]
......@@ -9329,6 +10950,22 @@ dependencies = [
"tonic 0.14.2",
]
[[package]]
name = "tonic-prost-build"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4a16cba4043dc3ff43fcb3f96b4c5c154c64cbd18ca8dce2ab2c6a451d058a2"
dependencies = [
"prettyplease",
"proc-macro2",
"prost-build 0.14.1",
"prost-types 0.14.1",
"quote",
"syn 2.0.110",
"tempfile",
"tonic-build 0.14.2",
]
[[package]]
name = "tower"
version = "0.4.13"
......@@ -9630,6 +11267,28 @@ dependencies = [
"ug",
]
[[package]]
name = "uint"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "909988d098b2f738727b161a106cfc7cab00c539c2687a8836f8e565976fb53e"
dependencies = [
"byteorder",
"crunchy",
"hex",
"static_assertions",
]
[[package]]
name = "ulid"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe"
dependencies = [
"rand 0.9.2",
"web-time",
]
[[package]]
name = "unarray"
version = "0.1.4"
......@@ -9782,12 +11441,34 @@ dependencies = [
"rand 0.8.5",
]
[[package]]
name = "universal-hash"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea"
dependencies = [
"crypto-common",
"subtle",
]
[[package]]
name = "unsafe-libyaml"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
[[package]]
name = "unsigned-varint"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6889a77d49f1f013504cec6bf97a2c730394adedaeb1deb5ea08949a50541105"
[[package]]
name = "unsigned-varint"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb066959b24b5196ae73cb057f45598450d2c5f71460e98c49b738086eff9c06"
[[package]]
name = "untrusted"
version = "0.9.0"
......@@ -9819,6 +11500,34 @@ dependencies = [
"webpki-roots 0.26.11",
]
[[package]]
name = "ureq"
version = "3.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d39cb1dbab692d82a977c0392ffac19e188bd9186a9f32806f0aaa859d75585a"
dependencies = [
"base64 0.22.1",
"log",
"percent-encoding",
"rustls",
"rustls-pki-types",
"ureq-proto",
"utf-8",
"webpki-roots 1.0.4",
]
[[package]]
name = "ureq-proto"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b4531c118335662134346048ddb0e54cc86bd7e81866757873055f0e38f5d2"
dependencies = [
"base64 0.22.1",
"http 1.3.1",
"httparse",
"log",
]
[[package]]
name = "url"
version = "2.5.7"
......@@ -10289,6 +11998,16 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efc5cf48f83140dcaab716eeaea345f9e93d0018fb81162753a3f76c3397b538"
dependencies = [
"windows-core 0.53.0",
"windows-targets 0.52.6",
]
[[package]]
name = "windows-core"
version = "0.52.0"
......@@ -10298,6 +12017,16 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-core"
version = "0.53.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dcc5b895a6377f1ab9fa55acedab1fd5ac0db66ad1e6c7f47e28a22e446a5dd"
dependencies = [
"windows-result 0.1.2",
"windows-targets 0.52.6",
]
[[package]]
name = "windows-core"
version = "0.62.2"
......@@ -10308,7 +12037,7 @@ dependencies = [
"windows-interface",
"windows-link 0.2.1",
"windows-result 0.4.1",
"windows-strings 0.5.1",
"windows-strings",
]
[[package]]
......@@ -10347,22 +12076,22 @@ checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
name = "windows-registry"
version = "0.5.3"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e"
checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720"
dependencies = [
"windows-link 0.1.3",
"windows-result 0.3.4",
"windows-strings 0.4.2",
"windows-link 0.2.1",
"windows-result 0.4.1",
"windows-strings",
]
[[package]]
name = "windows-result"
version = "0.3.4"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6"
checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8"
dependencies = [
"windows-link 0.1.3",
"windows-targets 0.52.6",
]
[[package]]
......@@ -10374,15 +12103,6 @@ dependencies = [
"windows-link 0.2.1",
]
[[package]]
name = "windows-strings"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57"
dependencies = [
"windows-link 0.1.3",
]
[[package]]
name = "windows-strings"
version = "0.5.1"
......@@ -10664,12 +12384,72 @@ dependencies = [
"winapi-build",
]
[[package]]
name = "x25519-dalek"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7e468321c81fb07fa7f4c636c3972b9100f0346e5b6a9f2bd0603a52f7ed277"
dependencies = [
"curve25519-dalek",
"rand_core 0.6.4",
"serde",
"zeroize",
]
[[package]]
name = "x509-parser"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4569f339c0c402346d4a75a9e39cf8dad310e287eef1ff56d4c68e5067f53460"
dependencies = [
"asn1-rs",
"data-encoding",
"der-parser",
"lazy_static",
"nom 7.1.3",
"oid-registry",
"rusticata-macros",
"thiserror 2.0.17",
"time",
]
[[package]]
name = "xattr"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156"
dependencies = [
"libc",
"rustix",
]
[[package]]
name = "xml-rs"
version = "0.8.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f"
[[package]]
name = "xmltree"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7d8a75eaf6557bb84a65ace8609883db44a29951042ada9b393151532e41fcb"
dependencies = [
"xml-rs",
]
[[package]]
name = "xxhash-rust"
version = "0.8.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
[[package]]
name = "y4m"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448"
[[package]]
name = "yaml-rust2"
version = "0.10.4"
......@@ -10681,12 +12461,52 @@ dependencies = [
"hashlink",
]
[[package]]
name = "yamux"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed0164ae619f2dc144909a9f082187ebb5893693d8c0196e8085283ccd4b776"
dependencies = [
"futures",
"log",
"nohash-hasher",
"parking_lot",
"pin-project",
"rand 0.8.5",
"static_assertions",
]
[[package]]
name = "yamux"
version = "0.13.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "deab71f2e20691b4728b349c6cee8fc7223880fa67b6b4f92225ec32225447e5"
dependencies = [
"futures",
"log",
"nohash-hasher",
"parking_lot",
"pin-project",
"rand 0.9.2",
"static_assertions",
"web-time",
]
[[package]]
name = "yansi"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
[[package]]
name = "yasna"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd"
dependencies = [
"time",
]
[[package]]
name = "yoke"
version = "0.7.5"
......@@ -10780,6 +12600,20 @@ name = "zeroize"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
dependencies = [
"zeroize_derive",
]
[[package]]
name = "zeroize_derive"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.110",
]
[[package]]
name = "zeromq"
......@@ -10926,6 +12760,12 @@ version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f423a2c17029964870cfaabb1f13dfab7d092a62a29a89264f4d36990ca414a"
[[package]]
name = "zune-core"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "111f7d9820f05fd715df3144e254d6fc02ee4088b0644c0ffd0efc9e6d9d2773"
[[package]]
name = "zune-inflate"
version = "0.2.54"
......@@ -10941,5 +12781,14 @@ version = "0.4.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29ce2c8a9384ad323cf564b67da86e21d3cfdff87908bc1223ed5c99bc792713"
dependencies = [
"zune-core",
"zune-core 0.4.12",
]
[[package]]
name = "zune-jpeg"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc6fb7703e32e9a07fb3f757360338b3a567a5054f21b5f52a666752e333d58e"
dependencies = [
"zune-core 0.5.0",
]
......@@ -14,6 +14,7 @@ members = [
"lib/bindings/c",
"lib/bindings/python/codegen",
"lib/engines/*",
"lib/discovery",
"lib/config",
"lib/kvbm-kernels",
]
......@@ -56,7 +57,7 @@ dynamo-parsers = { path = "lib/parsers", version = "0.7.0" }
# External dependencies
anyhow = { version = "1" }
async-nats = { version = "0.40", features = ["service"] }
async-nats = { version = "0.45.0", features = ["service"] }
async-stream = { version = "0.3" }
async-trait = { version = "0.1" }
async_zmq = { version = "0.4.0" }
......@@ -69,12 +70,12 @@ chrono = { version = "0.4", default-features = false, features = [
"now",
"serde",
] }
cudarc = { version = "0.17.1", features = ["cuda-12020"] }
cudarc = { version = "0.17.8", features = ["cuda-12020"] }
dashmap = { version = "6.1" }
derive_builder = { version = "0.20" }
derive-getters = { version = "0.5" }
either = { version = "1.13", features = ["serde"] }
etcd-client = { version = "0.16", features = ["tls"] }
etcd-client = { version = "0.17.0", features = ["tls"] }
futures = { version = "0.3" }
hf-hub = { version = "0.4.2", default-features = false, features = [
"tokio",
......@@ -91,8 +92,8 @@ libc = { version = "0.2" }
oneshot = { version = "0.1.11", features = ["std", "async"] }
parking_lot = "0.12.5"
prometheus = { version = "0.14" }
rand = { version = "0.9.0" }
reqwest = { version = "0.12.22", default-features = false, features = [
rand = { version = "0.9.2" }
reqwest = { version = "0.12.24", default-features = false, features = [
"json",
"stream",
"rustls-tls",
......@@ -102,12 +103,13 @@ serde = { version = "1", features = ["derive"] }
serde_json = { version = "1" }
strum = { version = "0.27", features = ["derive"] }
tempfile = "3"
thiserror = { version = "2.0.11" }
thiserror = { version = "2.0.17" }
tokio = { version = "1", features = ["full"] }
tokio-stream = { version = "0.1" }
tokio-util = { version = "0.7", features = ["codec", "net", "rt"] }
tower-http = { version = "0.6", features = ["trace"] }
axum = { version = "=0.8.4", features = ["macros"] }
axum-core = { version = "0.5.2" }
hyper = { version = "=1.7.0" }
hyper-util = { version = "=0.1.17" }
tracing = { version = "0.1" }
......@@ -121,7 +123,7 @@ opentelemetry = { version = "0.31.0", features = ["trace"] }
opentelemetry_sdk = { version = "0.31.0", features = ["trace", "rt-tokio"] }
opentelemetry-otlp = { version = "0.31.0", features = ["trace", "grpc-tonic"] }
validator = { version = "0.20.0", features = ["derive"] }
uuid = { version = "1.17", features = ["v4", "serde"] }
uuid = { version = "1.18.1", features = ["v4", "serde"] }
url = { version = "2.5", features = ["serde"] }
xxhash-rust = { version = "0.8", features = ["xxh3", "const_xxh3"] }
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
[package]
name = "dynamo-discovery"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
[dependencies]
# Core dependencies (always present)
anyhow = "1.0"
async-trait = "0.1"
bytes = { version = "1.8", features = ["serde"] }
dashmap = "6.1"
derive_builder = { workspace = true }
figment = { version = "0.10", features = ["toml", "yaml", "env"] }
futures = { version = "0.3" }
parking_lot = { version = "0.12" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
tokio = { version = "1.43", features = ["full"] }
tokio-util = "0.7"
thiserror = "2.0"
tracing = "0.1"
uuid = { version = "1.11", features = ["v4", "serde"] }
xxhash-rust = { version = "0.8", features = ["xxh3"] }
validator = { workspace = true }
# HTTP service dependencies (optional)
axum = { version = "0.8", optional = true }
tower = { version = "0.5", optional = true }
tower-http = { version = "0.6", features = ["trace"], optional = true }
hyper = { version = "1.5", optional = true }
reqwest = { version = "0.12", features = ["json"], optional = true }
tokio-stream = { version = "0.1", optional = true }
futures-util = { version = "0.3", optional = true }
# Etcd dependencies (optional)
etcd-client = { version = "0.17", optional = true }
tonic = { version = "0.14", optional = true }
# Libp2p dependencies (optional) - Match am-core versions
libp2p = { version = "0.56", default-features = false, features = [
"tcp",
"noise",
"yamux",
"macros",
"tokio",
"pnet",
], optional = true }
libp2p-kad = { version = "0.48", optional = true }
libp2p-mdns = { version = "0.48", features = ["tokio"], optional = true }
libp2p-swarm = { version = "0.47", optional = true }
libp2p-identity = { version = "0.2", optional = true }
blake2 = { version = "0.10", optional = true }
[dev-dependencies]
tokio = { version = "1.43", features = ["full", "test-util"] }
clap = { version = "4", features = ["derive"] }
tempfile = "3"
testcontainers = "0.25"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
[features]
default = ["p2p", "etcd"]
etcd = ["dep:etcd-client", "dep:tonic"]
p2p = ["dep:libp2p", "dep:libp2p-kad", "dep:libp2p-mdns", "dep:libp2p-swarm", "dep:libp2p-identity", "dep:blake2"]
full = ["etcd", "p2p"]
testing-etcd = [] # Enable etcd tests (opt-in: not part of `default`; enable with --features testing-etcd)
integration-etcd = [] # Feature flag for integration tests with real etcd (requires Docker)
# http-service = ["dep:axum", "dep:tower", "dep:tower-http", "dep:hyper", "dep:reqwest", "dep:tracing-subscriber", "dep:tokio-stream", "dep:futures-util"]
# dynamo-discovery (lib/discovery)
A small, capability-driven discovery layer for the Dynamo runtime. The core idea is to separate “what the application needs to discover” from “how a particular backend provides it,” and to provide a thin manager that composes multiple backends with caching and a concise, stable API.
## Philosophy
- Discovery is application-specific. Each application defines discovery traits that describe the information it needs (e.g., peers, topics, shards, services) and the operations required to work with that information.
- Systems are concrete implementations. A system (e.g., etcd, libp2p, an HTTP microservice, S3, NATS) implements one or more of the discovery traits. Different systems have different capabilities; not every system can implement every trait or policy.
- Managers orchestrate and cache. A manager owns the logic to coordinate multiple systems that implement the same trait, deduplicate concurrent lookups, maintain a local cache, and expose a clean public API tailored for the runtime.
This division lets you grow capabilities without coupling the runtime to any one backend. Traits define the contract; systems provide the plumbing; managers keep the runtime simple and fast.
## Core Concepts
- Discovery traits
- Define what the application wants to discover and the related operations.
- Include both public-facing operations (what the runtime calls) and internal operations (used for registration, consistency checks, etc.).
- Systems
- Backend-specific code that implements one or more discovery traits (and only the parts they can support).
- Example systems: `etcd` (centralized + TTL), `libp2p` (DHT), an HTTP service client, S3, NATS, in-memory.
- A system may expose just a subset of traits based on its capability.
- Managers
- Constructed with one or more system implementations of a trait.
- Provide a concise, stable public API, while handling caching, coalescing, retries, and capability differences behind the scenes.
- Allow you to mix-and-match systems for resilience and performance (e.g., fast in-memory cache + remote etcd).
## Capability Model
- Traits describe behavior; systems opt into the parts they can implement.
- The `DiscoverySystem` abstraction can vend one or more trait implementations. If a system cannot support a trait, it simply does not provide it.
- Managers accept a set of trait implementations and will use whatever is provided, with graceful fallback rules (e.g., local cache first, then remote sources).
## Example: Peer Discovery
The peer discovery trait is used by the runtime to translate identifiers into addresses and to manage lifecycle around registration.
- Trait methods (conceptual):
- `discover_by_worker_id(worker_id) -> PeerInfo`
- `discover_by_instance_id(instance_id) -> PeerInfo`
- `register_instance(instance_id, address) -> ()`
- `unregister_instance(instance_id) -> ()`
- Manager API (public vs. internal):
- Public: discovery queries
- `discover_by_worker_id(worker_id)`
- `discover_by_instance_id(instance_id)`
- Internal: lifecycle
- Registration and unregistration are handled by the manager when it is constructed (register the local peer) and during shutdown/cleanup. These are not exposed as public manager methods.
- Why hide registration on the manager?
- Keeps the runtime call surface minimal and intentional.
- Enforces consistent lifecycle semantics (checksums, collisions, TTLs) in one place.
- Avoids leaking backend mechanics into the runtime path.
### How the Manager Works (at a glance)
- On construction, the manager registers the local peer in its local cache and in all configured remote systems that support the peer discovery trait.
- On lookup, it consults the local cache first, then queries remotes if needed. Concurrent lookups for the same key are coalesced into a shared query. Successful remote results are cached locally for future fast paths.
## Typical Wiring
- Choose your systems and build them (e.g., Etcd with TTL, Libp2p, HTTP client, or an in-memory source for tests).
- Extract the trait implementations the runtime needs (e.g., `PeerDiscovery`).
- Create a manager with the local peer and a list of trait impls:
```rust
use std::sync::Arc;
use dynamo_discovery::peer::{PeerInfo, WorkerAddress, InstanceId};
use dynamo_discovery::peer::manager::PeerDiscoveryManager;
use dynamo_discovery::systems::DiscoverySystem; // e.g., etcd system builds this
# async fn example(system: Arc<dyn DiscoverySystem>) -> anyhow::Result<()> {
let local_instance = InstanceId::new_v4();
let local_address = WorkerAddress::from_bytes(b"tcp://127.0.0.1:5555".as_slice());
let local_peer = PeerInfo::new(local_instance, local_address);
// Get one or more implementations of the peer discovery trait
let mut sources = Vec::new();
if let Some(peer_disc) = system.peer_discovery() {
sources.push(peer_disc);
}
// Build the manager that orchestrates cache + remotes
let manager = PeerDiscoveryManager::new(local_peer, sources).await?;
// Look up a peer by worker_id or instance_id
// (The manager will hit local cache first, then remotes as needed.)
let _maybe = manager.discover_by_worker_id(local_instance.worker_id()).await;
let _maybe2 = manager.discover_by_instance_id(local_instance).await;
# Ok(())
# }
```
Note: The manager deliberately keeps registration/unregistration internal. If your application lifecycle requires explicit registration timing, do that by constructing the manager at the appropriate point in startup, and let it handle registration with all configured systems.
## Extending the Crate
- Add a new discovery trait when the application needs to discover a new kind of thing (e.g., shard ownership). Keep the trait small and precise.
- Implement the trait in one or more systems. It’s fine if only some systems can implement it.
- Add a manager for the trait if you need composition, caching, or a slimmer public API for the runtime.
- Keep trait-level semantics strict and documented. Managers can hide backend-specific details while enforcing common policies (e.g., collision detection, address checksums, TTLs).
## Notes on Consistency and Errors
- Systems may enforce additional policies (e.g., TTL expiry in etcd, collision detection, checksum validation). Managers use these and surface simple success/not-found/backend-error semantics to the runtime.
- Local caches accelerate the common path and are populated opportunistically from successful remote lookups.
- Concurrent lookups are deduplicated to reduce load on remote systems.
## Available Systems (examples)
- Etcd-backed system (centralized, TTL-based, keep-alive, transactional collision detection).
- Libp2p-backed system (decentralized DHT).
- HTTP service client.
- In-memory (useful for tests and single-node scenarios).
Not all systems will implement every trait; use the manager’s composition to mix what you need.
## Why This Design?
- Keeps the runtime portable: swap discovery backends without changing call sites.
- Embraces partial capability: wire up the systems that can do the job, skip the rest.
- Minimizes API surface for the runtime: managers expose only the operations the runtime actually needs, while handling lifecycle internally.
- Encourages small traits and pluggable systems, so the crate can evolve without lock-in.
If you’re adding a new trait or system, keep the trait narrowly scoped, stick to clear semantics, and lean on the manager to integrate, cache, and present a clean API to the rest of the runtime.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// Peer discovery items. The README example imports `PeerInfo`, `WorkerAddress`,
// and `InstanceId` from this module.
pub mod peer;
// Discovery backends ("systems"). The README example imports the
// `DiscoverySystem` abstraction from this module.
pub mod systems;
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Address types for peer discovery.
//!
//! This module provides types for representing worker addresses and peer information:
//! - [`WorkerAddress`]: Opaque byte representation of a peer's network address
//! - [`PeerInfo`]: Combined instance ID and worker address for a discovered peer
//!
//! These types are intentionally transport-agnostic, storing addresses as opaque bytes.
//! The interpretation of these bytes is left to the active message runtime.
use super::{InstanceId, WorkerId};
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use xxhash_rust::xxh3::xxh3_64;
/// Transport-agnostic network address of a worker, stored as opaque bytes.
///
/// Discovery never inspects the payload; interpreting the bytes is left to the
/// active message runtime that consumes the address.
///
/// # Checksum
///
/// [`WorkerAddress::checksum`] hashes the raw bytes with `xxh3_64`, which gives a
/// cheap way to compare two addresses during re-registration validation.
///
/// # Serialization
///
/// The wrapper is `#[serde(transparent)]`, so it serializes exactly like the
/// inner `Bytes` value.
#[derive(Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct WorkerAddress(Bytes);
impl WorkerAddress {
/// Create a new WorkerAddress from bytes.
pub fn from_bytes(bytes: impl Into<Bytes>) -> Self {
Self(bytes.into())
}
/// Get the underlying bytes.
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
/// Get the bytes as a Bytes object.
pub fn to_bytes(&self) -> Bytes {
self.0.clone()
}
/// Compute a checksum of this address for validation.
///
/// This is used to quickly check if an address has changed during re-registration.
pub fn checksum(&self) -> u64 {
xxh3_64(self.as_bytes())
}
}
impl fmt::Debug for WorkerAddress {
    /// Render as `WorkerAddress(len=<bytes>, xxh3_64=0x<hex>)`.
    ///
    /// Only the payload length and checksum are printed; the raw address bytes
    /// are never written to the output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let byte_len = self.0.len();
        let digest = self.checksum();

        let mut tuple = f.debug_tuple("WorkerAddress");
        tuple.field(&format_args!(
            "len={}, xxh3_64=0x{:016x}",
            byte_len, digest
        ));
        tuple.finish()
    }
}
impl fmt::Display for WorkerAddress {
    /// Render as `WorkerAddress(xxh3_64=0x<hex>)`, identifying the address by
    /// its checksum rather than by its raw bytes.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let digest = self.checksum();
        write!(f, "WorkerAddress(xxh3_64=0x{:016x})", digest)
    }
}
/// Peer information combining instance ID and worker address.
///
/// This is the primary type returned by discovery lookups. It pairs the peer's
/// [`InstanceId`] with the opaque [`WorkerAddress`] needed to connect to it.
///
/// # Example
///
/// ```
/// use bytes::Bytes;
/// use dynamo_discovery::peer::{InstanceId, PeerInfo, WorkerAddress};
///
/// let instance_id = InstanceId::new_v4();
/// let address = WorkerAddress::from_bytes(Bytes::from_static(b"tcp://127.0.0.1:5555"));
/// let peer_info = PeerInfo::new(instance_id, address);
///
/// assert_eq!(peer_info.instance_id(), instance_id);
/// assert_eq!(peer_info.worker_id(), instance_id.worker_id());
/// ```
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PeerInfo {
    /// The instance ID of the peer
    pub instance_id: InstanceId,
    /// The worker address for connecting to the peer
    pub worker_address: WorkerAddress,
}
impl PeerInfo {
/// Create a new PeerInfo.
pub fn new(instance_id: InstanceId, worker_address: WorkerAddress) -> Self {
Self {
instance_id,
worker_address,
}
}
/// Get the instance ID.
pub fn instance_id(&self) -> InstanceId {
self.instance_id
}
/// Get the worker ID (derived from instance ID).
pub fn worker_id(&self) -> WorkerId {
self.instance_id.worker_id()
}
/// Get a reference to the worker address.
pub fn worker_address(&self) -> &WorkerAddress {
&self.worker_address
}
/// Get the worker address checksum for validation.
pub fn address_checksum(&self) -> u64 {
self.worker_address.checksum()
}
/// Consume self and return the worker address.
pub fn into_address(self) -> WorkerAddress {
self.worker_address
}
/// Decompose into instance ID and worker address.
pub fn into_parts(self) -> (InstanceId, WorkerAddress) {
(self.instance_id, self.worker_address)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Address bytes used by most tests.
    const PRIMARY: &[u8] = b"tcp://127.0.0.1:5555";
    /// A second, different address for inequality checks.
    const SECONDARY: &[u8] = b"tcp://127.0.0.1:6666";

    /// Build a `WorkerAddress` from a static byte string.
    fn address_of(raw: &'static [u8]) -> WorkerAddress {
        WorkerAddress::from_bytes(Bytes::from_static(raw))
    }

    /// Build a peer with a fresh random instance ID and the primary address,
    /// returning the ingredients alongside the assembled `PeerInfo`.
    fn sample_peer() -> (InstanceId, WorkerAddress, PeerInfo) {
        let id = InstanceId::new_v4();
        let addr = address_of(PRIMARY);
        let peer = PeerInfo::new(id, addr.clone());
        (id, addr, peer)
    }

    #[test]
    fn test_worker_address_creation() {
        let raw = Bytes::from_static(PRIMARY);
        let wrapped = WorkerAddress::from_bytes(raw.clone());

        assert_eq!(wrapped.as_bytes(), &raw[..]);
        assert_eq!(wrapped.to_bytes(), raw);
    }

    #[test]
    fn test_worker_address_checksum() {
        let first = address_of(PRIMARY);
        let twin = address_of(PRIMARY);
        let other = address_of(SECONDARY);

        // Identical bytes must hash identically.
        assert_eq!(first.checksum(), twin.checksum());
        // Distinct bytes are expected (though not guaranteed) to hash differently.
        assert_ne!(first.checksum(), other.checksum());
    }

    #[test]
    fn test_worker_address_equality() {
        let first = address_of(PRIMARY);
        let twin = address_of(PRIMARY);
        let other = address_of(SECONDARY);

        assert_eq!(first, twin);
        assert_ne!(first, other);
    }

    #[test]
    fn test_worker_address_debug() {
        let rendered = format!("{:?}", address_of(b"test"));

        for expected in ["WorkerAddress", "len=4", "xxh3_64="] {
            assert!(rendered.contains(expected));
        }
    }

    #[test]
    fn test_peer_info_creation() {
        let (id, addr, peer) = sample_peer();

        assert_eq!(peer.instance_id(), id);
        assert_eq!(peer.worker_id(), id.worker_id());
        assert_eq!(peer.worker_address(), &addr);
    }

    #[test]
    fn test_peer_info_checksum() {
        let (_, addr, peer) = sample_peer();

        assert_eq!(peer.address_checksum(), addr.checksum());
    }

    #[test]
    fn test_peer_info_into_address() {
        let (_, addr, peer) = sample_peer();

        assert_eq!(peer.into_address(), addr);
    }

    #[test]
    fn test_peer_info_into_parts() {
        let (id, addr, peer) = sample_peer();

        let (got_id, got_addr) = peer.into_parts();
        assert_eq!(got_id, id);
        assert_eq!(got_addr, addr);
    }

    #[test]
    fn test_peer_info_serde() {
        let (id, _, peer) = sample_peer();

        // Round-trip through JSON.
        let encoded = serde_json::to_string(&peer).unwrap();
        let decoded: PeerInfo = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.instance_id(), id);
        assert_eq!(decoded.worker_id(), id.worker_id());
        assert_eq!(decoded.worker_address().as_bytes(), PRIMARY);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Identity types for the active message system.
//!
//! This module provides strongly-typed wrappers for instance and worker identifiers:
//! - [`InstanceId`]: Unique runtime instance identifier (wraps UUID)
//! - [`WorkerId`]: Deterministic 64-bit worker identifier derived from InstanceId
//!
//! # Design Principles
//!
//! 1. **Type Safety**: InstanceId cannot be confused with message IDs or other UUIDs
//! 2. **Deterministic Derivation**: WorkerId is always computed from InstanceId (xxh3_64 hash)
//! 3. **Single Source of Truth**: InstanceId is the primary identifier, WorkerId is derived
//!
//! # Example
//!
//! ```ignore
//! // InstanceIds are created internally by the runtime
//! // and obtained from ActiveMessageClient.instance_id()
//!
//! use dynamo_am::api::identity::{InstanceId, WorkerId};
//!
//! # fn get_instance_id() -> InstanceId { unimplemented!() }
//! let instance_id = get_instance_id(); // From ActiveMessageClient
//!
//! // Derive worker ID automatically
//! let worker_id: WorkerId = instance_id.worker_id();
//!
//! // WorkerId is deterministic
//! assert_eq!(worker_id, instance_id.worker_id());
//! ```
use serde::{Deserialize, Serialize};
use std::fmt;
use uuid::Uuid;
use xxhash_rust::xxh3::xxh3_64;
/// Identifier of one running instance of the active message runtime.
///
/// A newtype over [`Uuid`]; `#[serde(transparent)]` makes it serialize exactly
/// like the wrapped UUID. It is used for:
/// - Transport-level addressing
/// - Discovery registration
/// - Routing table management
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct InstanceId(Uuid);
impl InstanceId {
/// Create a new random v4 InstanceId.
///
/// This is exposed for testing and special cases. In production, use
/// [`InstanceFactory::create()`] instead.
pub fn new_v4() -> Self {
Self(Uuid::new_v4())
}
/// Create an InstanceId from a UUID.
pub fn from_uuid(uuid: Uuid) -> Self {
Self(uuid)
}
/// Create an InstanceId from raw bytes.
pub fn from_bytes(bytes: [u8; 16]) -> Self {
Self(Uuid::from_bytes(bytes))
}
/// Derive the deterministic WorkerId from this InstanceId.
///
/// WorkerId is computed using xxh3_64 hash of the UUID bytes.
/// This ensures a 1:1 mapping between InstanceId and WorkerId.
pub fn worker_id(&self) -> WorkerId {
WorkerId::from(self)
}
/// Get a reference to the underlying UUID.
pub fn as_uuid(&self) -> &Uuid {
&self.0
}
/// Get the underlying UUID as a u128.
pub fn as_u128(&self) -> u128 {
self.0.as_u128()
}
/// Get the underlying UUID as bytes.
pub fn as_bytes(&self) -> &[u8; 16] {
self.0.as_bytes()
}
}
/// Renders the instance id with the wrapped UUID's own `Display` output.
impl fmt::Display for InstanceId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!("{}", self.as_uuid()))
    }
}
impl From<Uuid> for InstanceId {
fn from(uuid: Uuid) -> Self {
Self(uuid)
}
}
impl From<InstanceId> for Uuid {
fn from(id: InstanceId) -> Self {
id.0
}
}
impl AsRef<Uuid> for InstanceId {
fn as_ref(&self) -> &Uuid {
&self.0
}
}
/// 64-bit worker identifier derived deterministically from an [`InstanceId`].
///
/// WorkerId is used in:
/// - [`crate::event::EventHandle`]: Embedded in the u128 event handle (64 bits)
/// - [`crate::event::EventRoutingTable`]: Maps worker_id → instance_id for event routing
/// - Discovery systems: Lookup key for peer information
///
/// The value is the xxh3_64 hash of the instance's UUID bytes, so it never has
/// to be stored next to the `InstanceId`; it can always be recomputed.
/// `#[serde(transparent)]` makes it serialize as a plain `u64`.
///
/// # Example
///
/// ```ignore
/// use dynamo_am::api::identity::{InstanceId, WorkerId};
///
/// # fn get_instance_id() -> InstanceId { unimplemented!() }
/// let instance_id = get_instance_id(); // From ActiveMessageClient
/// let worker_id = instance_id.worker_id();
///
/// // WorkerId is deterministic
/// assert_eq!(worker_id, instance_id.worker_id());
/// ```
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct WorkerId(u64);
impl WorkerId {
/// Create a WorkerId from a raw u64 value.
///
/// This is used when decoding WorkerIds from event handles or wire formats.
/// External users should always derive WorkerId via `instance_id.worker_id()`.
pub fn from_u64(value: u64) -> Self {
Self(value)
}
/// Get the underlying u64 value.
#[inline(always)]
pub fn as_u64(&self) -> u64 {
self.0
}
}
/// Renders the worker id as its decimal `u64` value.
impl fmt::Display for WorkerId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!("{}", self.as_u64()))
    }
}
impl From<&InstanceId> for WorkerId {
/// Derive WorkerId from InstanceId using xxh3_64 hash.
///
/// This is the canonical way to compute WorkerId - it should never be
/// constructed any other way to ensure consistency.
fn from(id: &InstanceId) -> Self {
Self(xxh3_64(id.as_uuid().as_bytes()))
}
}
/// By-value convenience conversion; delegates to the `&InstanceId` derivation.
impl From<InstanceId> for WorkerId {
    fn from(id: InstanceId) -> Self {
        let borrowed: &InstanceId = &id;
        borrowed.into()
    }
}
impl From<WorkerId> for u64 {
fn from(id: WorkerId) -> Self {
id.0
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Fresh ids differ from each other and survive a round trip through `Uuid`.
    #[test]
    fn test_instance_id_creation() {
        let first = InstanceId::new_v4();
        let second = InstanceId::new_v4();
        assert_ne!(first, second);

        let raw: Uuid = first.into();
        let rebuilt = InstanceId::from(raw);
        assert_eq!(first, rebuilt);
    }

    // Deriving twice from one instance agrees; two random instances are expected to differ.
    #[test]
    fn test_worker_id_deterministic() {
        let instance = InstanceId::new_v4();
        let derived_once = instance.worker_id();
        let derived_twice = instance.worker_id();
        assert_eq!(derived_once, derived_twice);

        let unrelated = InstanceId::new_v4().worker_id();
        assert_ne!(derived_once, unrelated);
    }

    // `From<&InstanceId>`, `From<InstanceId>` and `.worker_id()` must all agree.
    #[test]
    fn test_worker_id_from_conversion() {
        let instance = InstanceId::new_v4();
        let by_ref = WorkerId::from(&instance);
        let by_value = WorkerId::from(instance);
        assert_eq!(by_ref, by_value);
        assert_eq!(by_ref, instance.worker_id());
    }

    // `InstanceId` displays exactly like the UUID it wraps.
    #[test]
    fn test_instance_id_display() {
        let instance = InstanceId::new_v4();
        let shown = instance.to_string();
        let expected = instance.as_uuid().to_string();
        assert_eq!(shown, expected);
    }

    // `WorkerId` displays exactly like its raw `u64`.
    #[test]
    fn test_worker_id_display() {
        let worker = InstanceId::new_v4().worker_id();
        let shown = worker.to_string();
        let expected = worker.as_u64().to_string();
        assert_eq!(shown, expected);
    }

    // JSON form is the plain UUID string, and it round-trips.
    #[test]
    fn test_instance_id_serde() {
        let instance = InstanceId::new_v4();

        let encoded = serde_json::to_string(&instance).unwrap();
        let encoded_uuid = serde_json::to_string(instance.as_uuid()).unwrap();
        assert_eq!(encoded, encoded_uuid);

        let decoded: InstanceId = serde_json::from_str(&encoded).unwrap();
        assert_eq!(instance, decoded);
    }

    // JSON form is the plain u64, and it round-trips.
    #[test]
    fn test_worker_id_serde() {
        let worker = InstanceId::new_v4().worker_id();

        let encoded = serde_json::to_string(&worker).unwrap();
        let encoded_raw = serde_json::to_string(&worker.as_u64()).unwrap();
        assert_eq!(encoded, encoded_raw);

        let decoded: WorkerId = serde_json::from_str(&encoded).unwrap();
        assert_eq!(worker, decoded);
    }

    // Each `as_*` accessor mirrors the same accessor on the wrapped UUID.
    #[test]
    fn test_instance_id_as_methods() {
        let raw = Uuid::new_v4();
        let instance = InstanceId::from_uuid(raw);
        assert_eq!(instance.as_uuid(), &raw);
        assert_eq!(instance.as_u128(), raw.as_u128());
        assert_eq!(instance.as_bytes(), raw.as_bytes());
    }

    // Building from raw bytes reproduces the original UUID.
    #[test]
    fn test_instance_id_from_bytes() {
        let raw = Uuid::new_v4();
        let instance = InstanceId::from_bytes(*raw.as_bytes());
        assert_eq!(instance.as_uuid(), &raw);
    }

    // `as_u64` followed by `from_u64` is the identity.
    #[test]
    fn test_worker_id_u64_conversion() {
        let worker = InstanceId::new_v4().worker_id();
        let rebuilt = WorkerId::from_u64(worker.as_u64());
        assert_eq!(worker, rebuilt);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::Arc;
use crate::peer::{
DiscoveryError, DiscoveryQueryError, InstanceId, PeerInfo, WorkerAddress, WorkerId,
};
/// In-process peer registry.
///
/// The state lives behind an `Arc<RwLock<..>>`, so cloning this handle yields
/// another handle onto the same registry rather than a copy of its contents.
#[derive(Clone, Debug, Default)]
pub struct LocalPeerDiscovery {
    inner: Arc<RwLock<LocalPeerDiscoveryInner>>,
}
/// Lock-protected state of [`LocalPeerDiscovery`].
#[derive(Clone, Debug, Default)]
struct LocalPeerDiscoveryInner {
    /// Secondary index: worker id -> the instance registered under it.
    by_worker_id: HashMap<WorkerId, InstanceId>,
    /// Primary table: instance id -> full peer information.
    by_instance_id: HashMap<InstanceId, PeerInfo>,
}
impl LocalPeerDiscovery {
    /// Look up a peer by its worker id.
    ///
    /// Resolves the worker id to an instance id through the secondary index and
    /// then returns a clone of that instance's `PeerInfo`.
    ///
    /// # Errors
    ///
    /// Returns `DiscoveryQueryError::NotFound` when either lookup misses.
    pub fn discover_by_worker_id(
        &self,
        worker_id: WorkerId,
    ) -> Result<PeerInfo, DiscoveryQueryError> {
        let state = self.inner.read();
        state
            .by_worker_id
            .get(&worker_id)
            .and_then(|instance_id| state.by_instance_id.get(instance_id))
            .cloned()
            .ok_or(DiscoveryQueryError::NotFound)
    }

    /// Look up a peer by its instance id.
    ///
    /// # Errors
    ///
    /// Returns `DiscoveryQueryError::NotFound` when the instance is not registered.
    pub fn discover_by_instance_id(
        &self,
        instance_id: InstanceId,
    ) -> Result<PeerInfo, DiscoveryQueryError> {
        let state = self.inner.read();
        state
            .by_instance_id
            .get(&instance_id)
            .cloned()
            .ok_or(DiscoveryQueryError::NotFound)
    }

    /// Register `instance_id` at `worker_address`.
    ///
    /// Registration is strictly one-shot: any second attempt for an instance
    /// that is already present fails, even with an identical address.
    ///
    /// # Errors
    ///
    /// - `DiscoveryError::WorkerIdCollision` when a *different* instance is
    ///   already registered under the same derived worker id.
    /// - `DiscoveryError::InstanceAlreadyRegistered` when the instance is
    ///   already registered with an address of the same checksum.
    /// - `DiscoveryError::ChecksumMismatch` when the instance is already
    ///   registered with an address of a different checksum.
    pub fn register_instance(
        &self,
        instance_id: InstanceId,
        worker_address: WorkerAddress,
    ) -> Result<(), DiscoveryError> {
        let mut state = self.inner.write();

        // Validate no worker_id collision
        let worker_id = instance_id.worker_id();
        if let Some(existing_instance) = state.by_worker_id.get(&worker_id)
            && *existing_instance != instance_id
        {
            return Err(DiscoveryError::WorkerIdCollision(
                worker_id,
                *existing_instance,
                instance_id,
            ));
        }

        // Fail-fast for any duplicate registration attempt
        if let Some(existing_peer_info) = state.by_instance_id.get(&instance_id) {
            // Check if it's the same address (idempotent attempt) or different
            if existing_peer_info.address_checksum() == worker_address.checksum() {
                // Duplicate registration with same address - fail to detect bugs
                return Err(DiscoveryError::InstanceAlreadyRegistered(instance_id));
            } else {
                // Re-registration with different address - fail with checksum mismatch
                return Err(DiscoveryError::ChecksumMismatch(
                    instance_id,
                    existing_peer_info.address_checksum(),
                    worker_address.checksum(),
                ));
            }
        }

        // Register peer in both maps under the same write lock.
        let peer_info = PeerInfo::new(instance_id, worker_address);
        state.by_worker_id.insert(worker_id, instance_id);
        state.by_instance_id.insert(instance_id, peer_info);
        Ok(())
    }

    /// Remove `instance_id` from the registry.
    ///
    /// Unregistering an instance that is not present is a no-op and still
    /// returns `Ok(())`.
    #[expect(dead_code)]
    pub fn unregister_instance(&self, instance_id: InstanceId) -> Result<(), DiscoveryError> {
        let mut state = self.inner.write();
        let worker_id = instance_id.worker_id();
        // Only drop the worker-id index entry when it points at this instance.
        // A different instance with a colliding worker id may own the slot
        // (the one `register_instance` rejects with `WorkerIdCollision`);
        // removing it here would leave that instance unreachable by worker id.
        if state.by_worker_id.get(&worker_id) == Some(&instance_id) {
            state.by_worker_id.remove(&worker_id);
        }
        state.by_instance_id.remove(&instance_id);
        Ok(())
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
mod local;
use anyhow::Result;
use dashmap::DashMap;
use futures::future::{self, BoxFuture, Either, FutureExt, Ready, Shared};
use std::sync::Arc;
use local::LocalPeerDiscovery;
use crate::peer::{DiscoveryQueryError, InstanceId, PeerDiscovery, PeerInfo, WorkerId};
// Outcome of a single peer lookup: the peer's info, or a query error.
type QueryResult = Result<PeerInfo, DiscoveryQueryError>;
// Return type of the manager's lookups: `Left` is an already-resolved result,
// `Right` is a cloneable shared future that performs the remote queries when awaited.
type MaybeAsyncQueryResult = Either<Ready<QueryResult>, Shared<BoxFuture<'static, QueryResult>>>;
/// Shared query futures, keyed by what is being looked up, used to deduplicate
/// concurrent remote lookups.
///
/// Every caller asking for the same key receives a clone of the same `Shared`
/// future. Futures that resolve successfully are left in the map; the query
/// futures built by `PeerDiscoveryManager` remove their own entry when they
/// end in an error or `NotFound`, so a later call can retry.
#[derive(Default, Debug)]
struct PendingQueries {
    /// In-flight or completed lookups keyed by worker id.
    by_worker_id: DashMap<WorkerId, Shared<BoxFuture<'static, QueryResult>>>,
    /// In-flight or completed lookups keyed by instance id.
    by_instance_id: DashMap<InstanceId, Shared<BoxFuture<'static, QueryResult>>>,
}
/// Front door for peer lookups: consults a local registry first and falls back
/// to a list of remote discovery backends, deduplicating concurrent queries.
#[derive(Debug)]
pub struct PeerDiscoveryManager {
    /// Local registry; also used as a cache for peers found remotely.
    local: LocalPeerDiscovery,
    /// Remote backends, queried in order.
    remotes: Vec<Arc<dyn PeerDiscovery>>,
    /// Shared futures for remote lookups, keyed by the id being queried.
    pending: Arc<PendingQueries>,
}
impl PeerDiscoveryManager {
    /// Build a manager over the given remote discovery `sources`.
    ///
    /// When `local_peer` is provided it is registered first with the local
    /// registry and then, in order, with every remote source.
    ///
    /// # Errors
    ///
    /// Fails if the local registration is rejected or if any remote
    /// registration returns an error. Remotes that were already registered
    /// before the failing one are not rolled back.
    pub async fn new(
        local_peer: Option<PeerInfo>,
        sources: Vec<Arc<dyn PeerDiscovery>>,
    ) -> Result<Self> {
        let local = LocalPeerDiscovery::default();
        if let Some(local_peer) = &local_peer {
            let instance_id = local_peer.instance_id;
            let worker_address = local_peer.worker_address.clone();

            // register local peer with local discovery
            local
                .register_instance(instance_id, worker_address.clone())
                .map_err(|e| anyhow::anyhow!("Failed to register local peer: {}", e))?;

            // register local peer with remote discoveries
            for remote in &sources {
                remote
                    .register_instance(instance_id, worker_address.clone())
                    .await?;
            }
        }

        // TODO: Unregister local peer and remotes when the manager is dropped
        // Since drop is not async, we'll need to create a task to unregister the remote instances and
        // trigger that task during the drop implementation.
        Ok(Self {
            local,
            remotes: sources,
            pending: Arc::new(PendingQueries::default()),
        })
    }

    /// Look up a peer by worker id.
    ///
    /// Returns `Either::Left` with an already-resolved result when the peer is
    /// in the local registry, or when there are no remotes to ask (`NotFound`).
    /// Otherwise returns `Either::Right` with a shared future that queries the
    /// remotes in order; concurrent callers for the same id get clones of the
    /// same future. Nothing is sent to a remote until that future is awaited.
    ///
    /// A successful remote answer is also registered in the local registry
    /// (registration failures are only logged). When the lookup ends in an
    /// error or `NotFound`, the future removes itself from the pending map so
    /// that a later call can retry.
    pub async fn discover_by_worker_id(&self, worker_id: WorkerId) -> MaybeAsyncQueryResult {
        // Fast path: check local cache
        if let Ok(peer) = self.local.discover_by_worker_id(worker_id) {
            return Either::Left(future::ready(Ok(peer)));
        }
        if self.remotes.is_empty() {
            return Either::Left(future::ready(Err(DiscoveryQueryError::NotFound)));
        }

        // Check if there's already a pending query for this worker_id
        if let Some(shared_future) = self.pending.by_worker_id.get(&worker_id) {
            return Either::Right(shared_future.clone());
        }

        // Create a new shared future for this query
        let local = self.local.clone();
        let remotes = self.remotes.clone();
        let pending = self.pending.clone();

        use dashmap::mapref::entry::Entry;
        let shared_future = match self.pending.by_worker_id.entry(worker_id) {
            Entry::Occupied(entry) => {
                // Another thread beat us to it, use their future
                entry.get().clone()
            }
            Entry::Vacant(entry) => {
                // We're the first, create the shared future
                let shared = async move {
                    // Query remotes sequentially
                    for remote in &remotes {
                        match remote.discover_by_worker_id(worker_id).await {
                            Ok(peer_info) => {
                                // Cache the result in local store (ignore errors)
                                if let Err(e) = local.register_instance(
                                    peer_info.instance_id,
                                    peer_info.worker_address.clone(),
                                ) {
                                    tracing::debug!(
                                        "Failed to register peer info in local store: {}",
                                        e
                                    );
                                }
                                return Ok(peer_info);
                            }
                            Err(DiscoveryQueryError::NotFound) => continue,
                            Err(e) => {
                                // Clean up failed future from cache to allow retry
                                pending.by_worker_id.remove(&worker_id);
                                return Err(e);
                            }
                        }
                    }
                    // Clean up NotFound result from cache to allow retry
                    pending.by_worker_id.remove(&worker_id);
                    Err(DiscoveryQueryError::NotFound)
                }
                .boxed()
                .shared();
                entry.insert(shared.clone());
                shared
            }
        };
        Either::Right(shared_future)
    }

    /// Look up a peer by instance id.
    ///
    /// Behaves like [`Self::discover_by_worker_id`], keyed by instance id:
    /// `Either::Left` for a local hit or when there are no remotes
    /// (`NotFound`), otherwise `Either::Right` with a deduplicated shared
    /// future that queries the remotes in order when awaited.
    pub async fn discover_by_instance_id(&self, instance_id: InstanceId) -> MaybeAsyncQueryResult {
        // Fast path: check local cache
        if let Ok(peer) = self.local.discover_by_instance_id(instance_id) {
            return Either::Left(future::ready(Ok(peer)));
        }
        // With no remotes the answer is known immediately; skip allocating a
        // shared future and touching the pending map (mirrors the worker-id path).
        if self.remotes.is_empty() {
            return Either::Left(future::ready(Err(DiscoveryQueryError::NotFound)));
        }

        // Check if there's already a pending query for this instance_id
        if let Some(shared_future) = self.pending.by_instance_id.get(&instance_id) {
            return Either::Right(shared_future.clone());
        }

        // Create a new shared future for this query
        let local = self.local.clone();
        let remotes = self.remotes.clone();
        let pending = self.pending.clone();

        use dashmap::mapref::entry::Entry;
        let shared_future = match self.pending.by_instance_id.entry(instance_id) {
            Entry::Occupied(entry) => {
                // Another thread beat us to it, use their future
                entry.get().clone()
            }
            Entry::Vacant(entry) => {
                // We're the first, create the shared future
                let shared = async move {
                    // Query remotes sequentially
                    for remote in &remotes {
                        match remote.discover_by_instance_id(instance_id).await {
                            Ok(peer_info) => {
                                // Cache the result in local store (ignore errors)
                                if let Err(e) = local.register_instance(
                                    peer_info.instance_id,
                                    peer_info.worker_address.clone(),
                                ) {
                                    tracing::debug!(
                                        "Failed to register peer info in local store: {}",
                                        e
                                    );
                                }
                                return Ok(peer_info);
                            }
                            Err(DiscoveryQueryError::NotFound) => continue,
                            Err(e) => {
                                // Clean up failed future from cache to allow retry
                                pending.by_instance_id.remove(&instance_id);
                                return Err(e);
                            }
                        }
                    }
                    // Clean up NotFound result from cache to allow retry
                    pending.by_instance_id.remove(&instance_id);
                    Err(DiscoveryQueryError::NotFound)
                }
                .boxed()
                .shared();
                entry.insert(shared.clone());
                shared
            }
        };
        Either::Right(shared_future)
    }
}
#[cfg(test)]
mod tests {
use crate::peer::{DiscoveryError, WorkerAddress};
use super::*;
use bytes::Bytes;
use parking_lot::Mutex as StdMutex;
use std::collections::HashMap;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::time::Duration;
use tokio::sync::{Barrier, Notify};
// Test timeout duration
const TEST_TIMEOUT: Duration = Duration::from_secs(5);
/// Fixed loopback address shared by every test in this module.
fn make_test_address() -> WorkerAddress {
    let raw = Bytes::from_static(b"tcp://127.0.0.1:5555");
    WorkerAddress::from_bytes(raw)
}
// ============================================================================
// Mock Discovery Infrastructure
// ============================================================================
/// Test double for a remote discovery backend, with canned responses and call
/// counters. Clones share one underlying state through the `Arc`.
#[derive(Clone, Debug)]
struct MockDiscovery {
    inner: Arc<MockDiscoveryInner>,
}
/// Shared state behind [`MockDiscovery`].
#[derive(Debug)]
struct MockDiscoveryInner {
    // --- call counters, one per trait method ---
    worker_id_calls: AtomicUsize,
    instance_id_calls: AtomicUsize,
    register_calls: AtomicUsize,
    unregister_calls: AtomicUsize,

    // --- canned answers, looked up by the queried id ---
    worker_responses: StdMutex<HashMap<WorkerId, QueryResult>>,
    instance_responses: StdMutex<HashMap<InstanceId, QueryResult>>,

    // --- signalled on every query so tests can wait for calls ---
    worker_call_notify: Arc<Notify>,
    instance_call_notify: Arc<Notify>,

    // --- optional artificial latency applied before answering ---
    simulate_delay: AtomicBool,
    delay_duration: StdMutex<Duration>,
}
impl MockDiscovery {
    /// Fresh mock: zeroed counters, no canned responses, delay disabled
    /// (with a stored default of 100 ms should it be enabled later).
    fn new() -> Self {
        let state = MockDiscoveryInner {
            worker_id_calls: AtomicUsize::new(0),
            instance_id_calls: AtomicUsize::new(0),
            register_calls: AtomicUsize::new(0),
            unregister_calls: AtomicUsize::new(0),
            worker_responses: StdMutex::new(HashMap::new()),
            instance_responses: StdMutex::new(HashMap::new()),
            worker_call_notify: Arc::new(Notify::new()),
            instance_call_notify: Arc::new(Notify::new()),
            simulate_delay: AtomicBool::new(false),
            delay_duration: StdMutex::new(Duration::from_millis(100)),
        };
        Self {
            inner: Arc::new(state),
        }
    }

    /// Set the canned answer for `worker_id` (call before issuing the query).
    fn set_worker_response(&self, worker_id: WorkerId, result: QueryResult) {
        let mut table = self.inner.worker_responses.lock();
        table.insert(worker_id, result);
    }

    /// Set the canned answer for `instance_id` (call before issuing the query).
    fn set_instance_response(&self, instance_id: InstanceId, result: QueryResult) {
        let mut table = self.inner.instance_responses.lock();
        table.insert(instance_id, result);
    }

    /// Make every subsequent query sleep for `duration` before answering.
    fn enable_delay(&self, duration: Duration) {
        // Store the duration before raising the flag.
        *self.inner.delay_duration.lock() = duration;
        self.inner.simulate_delay.store(true, Ordering::SeqCst);
    }

    /// Number of `discover_by_worker_id` calls seen so far.
    fn worker_id_call_count(&self) -> usize {
        self.inner.worker_id_calls.load(Ordering::SeqCst)
    }

    /// Number of `discover_by_instance_id` calls seen so far.
    fn instance_id_call_count(&self) -> usize {
        self.inner.instance_id_calls.load(Ordering::SeqCst)
    }

    /// Number of `register_instance` calls seen so far.
    fn register_call_count(&self) -> usize {
        self.inner.register_calls.load(Ordering::SeqCst)
    }

    /// Number of `unregister_instance` calls seen so far.
    #[allow(dead_code)]
    fn unregister_call_count(&self) -> usize {
        self.inner.unregister_calls.load(Ordering::SeqCst)
    }

    /// Resolve once at least `min_calls` worker-id queries have been made.
    #[allow(dead_code)]
    async fn wait_for_worker_calls(&self, min_calls: usize) {
        while self.worker_id_call_count() < min_calls {
            // Create the `Notified` future first, then re-check the counter,
            // so a call landing between the two checks is not missed.
            let notified = self.inner.worker_call_notify.notified();
            if self.worker_id_call_count() >= min_calls {
                return;
            }
            notified.await;
        }
    }

    /// Resolve once at least `min_calls` instance-id queries have been made.
    #[allow(dead_code)]
    async fn wait_for_instance_calls(&self, min_calls: usize) {
        while self.instance_id_call_count() < min_calls {
            // Create the `Notified` future first, then re-check the counter,
            // so a call landing between the two checks is not missed.
            let notified = self.inner.instance_call_notify.notified();
            if self.instance_id_call_count() >= min_calls {
                return;
            }
            notified.await;
        }
    }
}
impl PeerDiscovery for MockDiscovery {
    /// Count the call, wake any waiters, then answer with the canned response
    /// for `worker_id` (`NotFound` if none was configured), after the optional delay.
    fn discover_by_worker_id(
        &self,
        worker_id: WorkerId,
    ) -> BoxFuture<'static, Result<PeerInfo, DiscoveryQueryError>> {
        let state = &self.inner;
        state.worker_id_calls.fetch_add(1, Ordering::SeqCst);
        state.worker_call_notify.notify_waiters();

        // The answer and delay settings are captured now, at call time,
        // not when the returned future is polled.
        let canned = state.worker_responses.lock().get(&worker_id).cloned();
        let answer = canned.unwrap_or_else(|| Err(DiscoveryQueryError::NotFound));
        let delayed = state.simulate_delay.load(Ordering::SeqCst);
        let pause = *state.delay_duration.lock();

        Box::pin(async move {
            if delayed {
                tokio::time::sleep(pause).await;
            }
            answer
        })
    }

    /// Count the call, wake any waiters, then answer with the canned response
    /// for `instance_id` (`NotFound` if none was configured), after the optional delay.
    fn discover_by_instance_id(
        &self,
        instance_id: InstanceId,
    ) -> BoxFuture<'static, Result<PeerInfo, DiscoveryQueryError>> {
        let state = &self.inner;
        state.instance_id_calls.fetch_add(1, Ordering::SeqCst);
        state.instance_call_notify.notify_waiters();

        // The answer and delay settings are captured now, at call time,
        // not when the returned future is polled.
        let canned = state.instance_responses.lock().get(&instance_id).cloned();
        let answer = canned.unwrap_or_else(|| Err(DiscoveryQueryError::NotFound));
        let delayed = state.simulate_delay.load(Ordering::SeqCst);
        let pause = *state.delay_duration.lock();

        Box::pin(async move {
            if delayed {
                tokio::time::sleep(pause).await;
            }
            answer
        })
    }

    /// Only counts the call; always succeeds.
    fn register_instance(
        &self,
        _instance_id: InstanceId,
        _worker_address: WorkerAddress,
    ) -> BoxFuture<'static, Result<(), DiscoveryError>> {
        self.inner.register_calls.fetch_add(1, Ordering::SeqCst);
        Box::pin(async { Ok(()) })
    }

    /// Only counts the call; always succeeds.
    fn unregister_instance(
        &self,
        _instance_id: InstanceId,
    ) -> BoxFuture<'static, Result<(), DiscoveryError>> {
        self.inner.unregister_calls.fetch_add(1, Ordering::SeqCst);
        Box::pin(async { Ok(()) })
    }
}
// ============================================================================
// PeerDiscoveryManager Tests
// ============================================================================
// The manager's own peer must be answered from the local registry as a ready future.
#[tokio::test]
async fn test_manager_local_cache_hit() {
    let scenario = async {
        let instance = InstanceId::new_v4();
        let address = make_test_address();
        let me = PeerInfo::new(instance, address.clone());
        let manager = PeerDiscoveryManager::new(Some(me), vec![])
            .await
            .unwrap();

        // Local hit => `Either::Left`.
        let Either::Left(ready) = manager.discover_by_worker_id(instance.worker_id()).await
        else {
            panic!("Expected immediate ready future, got async");
        };
        let peer = ready.into_inner().unwrap();
        assert_eq!(peer.instance_id(), instance);
        assert_eq!(peer.worker_address(), &address);
    };
    let result = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(result.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
// With no remotes, an unknown worker id resolves immediately to `NotFound`.
#[tokio::test]
async fn test_manager_no_remotes_returns_not_found() {
    let scenario = async {
        let me = PeerInfo::new(InstanceId::new_v4(), make_test_address());
        let manager = PeerDiscoveryManager::new(Some(me), vec![])
            .await
            .unwrap();

        let missing = WorkerId::from_u64(999);
        let Either::Left(ready) = manager.discover_by_worker_id(missing).await else {
            panic!("Expected immediate not found, got async");
        };
        let err = ready.into_inner().unwrap_err();
        assert!(matches!(err, DiscoveryQueryError::NotFound));
    };
    let result = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(result.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
// A local miss must be forwarded to the remote, exactly once.
#[tokio::test]
async fn test_manager_remote_query_on_miss() {
    let scenario = async {
        let me = PeerInfo::new(InstanceId::new_v4(), make_test_address());

        let mock = Arc::new(MockDiscovery::new());
        let wanted = WorkerId::from_u64(42);
        let remote_id = InstanceId::new_v4();
        let remote_addr = make_test_address();
        // Canned answer must be in place before the query is issued.
        mock.set_worker_response(wanted, Ok(PeerInfo::new(remote_id, remote_addr.clone())));

        let manager =
            PeerDiscoveryManager::new(Some(me), vec![mock.clone() as Arc<dyn PeerDiscovery>])
                .await
                .unwrap();

        let Either::Right(fut) = manager.discover_by_worker_id(wanted).await else {
            panic!("Expected async future for remote query");
        };
        let peer = fut.await.unwrap();
        assert_eq!(peer.instance_id(), remote_id);
        assert_eq!(peer.worker_address(), &remote_addr);

        assert_eq!(mock.worker_id_call_count(), 1);
    };
    let result = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(result.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
// Ten concurrent lookups of one worker id must result in a single remote call.
#[tokio::test]
async fn test_manager_concurrent_deduplication_worker_id() {
    let scenario = async {
        let me = PeerInfo::new(InstanceId::new_v4(), make_test_address());

        let mock = Arc::new(MockDiscovery::new());
        let wanted = WorkerId::from_u64(42);
        let expected_instance = InstanceId::new_v4();
        let canned = PeerInfo::new(expected_instance, make_test_address());

        // Slow remote, so all callers overlap with the in-flight query.
        mock.set_worker_response(wanted, Ok(canned.clone()));
        mock.enable_delay(Duration::from_millis(100));

        let manager = Arc::new(
            PeerDiscoveryManager::new(Some(me), vec![mock.clone() as Arc<dyn PeerDiscovery>])
                .await
                .unwrap(),
        );

        // 10 query tasks + this task all rendezvous on the barrier.
        let start_line = Arc::new(Barrier::new(11));
        let mut tasks = Vec::new();
        for _ in 0..10 {
            let manager = manager.clone();
            let start_line = start_line.clone();
            tasks.push(tokio::spawn(async move {
                start_line.wait().await;
                // Await the inner future too; that is what triggers the remote call.
                match manager.discover_by_worker_id(wanted).await {
                    Either::Right(fut) => Either::Right(fut.await),
                    Either::Left(ready) => Either::Left(ready.into_inner()),
                }
            }));
        }

        // Release every task at once, then let them start polling.
        start_line.wait().await;
        tokio::time::sleep(Duration::from_millis(50)).await;

        assert_eq!(
            mock.worker_id_call_count(),
            1,
            "Deduplication failed: mock was called more than once"
        );

        // Every caller sees the same peer, whichever path answered it.
        for task in tasks {
            let outcome = match task.await.unwrap() {
                Either::Right(outcome) | Either::Left(outcome) => outcome,
            };
            let peer = outcome.unwrap();
            assert_eq!(peer.instance_id(), expected_instance);
        }
    };
    let result = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(result.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
// Ten concurrent lookups of one instance id must result in a single remote call.
#[tokio::test]
async fn test_manager_concurrent_deduplication_instance_id() {
    let scenario = async {
        let me = PeerInfo::new(InstanceId::new_v4(), make_test_address());

        let mock = Arc::new(MockDiscovery::new());
        let wanted = InstanceId::new_v4();
        let canned = PeerInfo::new(wanted, make_test_address());

        // Slow remote, so all callers overlap with the in-flight query.
        mock.set_instance_response(wanted, Ok(canned.clone()));
        mock.enable_delay(Duration::from_millis(100));

        let manager = Arc::new(
            PeerDiscoveryManager::new(Some(me), vec![mock.clone() as Arc<dyn PeerDiscovery>])
                .await
                .unwrap(),
        );

        // 10 query tasks + this task all rendezvous on the barrier.
        let start_line = Arc::new(Barrier::new(11));
        let mut tasks = Vec::new();
        for _ in 0..10 {
            let manager = manager.clone();
            let start_line = start_line.clone();
            tasks.push(tokio::spawn(async move {
                start_line.wait().await;
                // Await the inner future too; that is what triggers the remote call.
                match manager.discover_by_instance_id(wanted).await {
                    Either::Right(fut) => Either::Right(fut.await),
                    Either::Left(ready) => Either::Left(ready.into_inner()),
                }
            }));
        }

        // Release every task at once, then let them start polling.
        start_line.wait().await;
        tokio::time::sleep(Duration::from_millis(50)).await;

        assert_eq!(
            mock.instance_id_call_count(),
            1,
            "Deduplication failed for instance_id queries"
        );

        // Every caller sees the same peer, whichever path answered it.
        for task in tasks {
            let outcome = match task.await.unwrap() {
                Either::Right(outcome) | Either::Left(outcome) => outcome,
            };
            let peer = outcome.unwrap();
            assert_eq!(peer.instance_id(), wanted);
        }
    };
    let result = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(result.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
// Lookups for different worker ids must not be deduplicated against each other.
#[tokio::test]
async fn test_manager_different_ids_independent() {
    let scenario = async {
        let me = PeerInfo::new(InstanceId::new_v4(), make_test_address());

        let mock = Arc::new(MockDiscovery::new());
        let first_id = WorkerId::from_u64(1);
        let second_id = WorkerId::from_u64(2);
        let first_peer = PeerInfo::new(InstanceId::new_v4(), make_test_address());
        let second_peer = PeerInfo::new(InstanceId::new_v4(), make_test_address());
        mock.set_worker_response(first_id, Ok(first_peer.clone()));
        mock.set_worker_response(second_id, Ok(second_peer.clone()));

        let manager = Arc::new(
            PeerDiscoveryManager::new(Some(me), vec![mock.clone() as Arc<dyn PeerDiscovery>])
                .await
                .unwrap(),
        );

        // Issue both lookups from separate tasks.
        let task1 = {
            let manager = manager.clone();
            tokio::spawn(async move { manager.discover_by_worker_id(first_id).await })
        };
        let task2 = {
            let manager = manager.clone();
            tokio::spawn(async move { manager.discover_by_worker_id(second_id).await })
        };
        let (joined1, joined2) = tokio::join!(task1, task2);

        // Both must have gone the remote (async) route.
        let (Either::Right(fut1), Either::Right(fut2)) = (joined1.unwrap(), joined2.unwrap())
        else {
            panic!("Expected async futures for both queries");
        };
        let found1 = fut1.await.unwrap();
        let found2 = fut2.await.unwrap();
        assert_eq!(found1.instance_id(), first_peer.instance_id());
        assert_eq!(found2.instance_id(), second_peer.instance_id());
        assert_ne!(found1.instance_id(), found2.instance_id());

        // One remote call per distinct id.
        assert_eq!(mock.worker_id_call_count(), 2);
    };
    let result = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(result.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
// When the first remote reports `NotFound`, the next remote in the list is asked.
#[tokio::test]
async fn test_manager_sequential_remote_fallback() {
    let scenario = async {
        let me = PeerInfo::new(InstanceId::new_v4(), make_test_address());

        let first = Arc::new(MockDiscovery::new());
        let second = Arc::new(MockDiscovery::new());
        let wanted = WorkerId::from_u64(42);
        let canned = PeerInfo::new(InstanceId::new_v4(), make_test_address());

        // Miss on the first remote, hit on the second.
        first.set_worker_response(wanted, Err(DiscoveryQueryError::NotFound));
        second.set_worker_response(wanted, Ok(canned.clone()));

        let remotes = vec![
            first.clone() as Arc<dyn PeerDiscovery>,
            second.clone() as Arc<dyn PeerDiscovery>,
        ];
        let manager = PeerDiscoveryManager::new(Some(me), remotes)
            .await
            .unwrap();

        let Either::Right(fut) = manager.discover_by_worker_id(wanted).await else {
            panic!("Expected async future");
        };
        let peer = fut.await.unwrap();
        assert_eq!(peer.instance_id(), canned.instance_id());

        // Each remote was consulted exactly once.
        assert_eq!(first.worker_id_call_count(), 1);
        assert_eq!(second.worker_id_call_count(), 1);
    };
    let result = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(result.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
// When every remote reports `NotFound`, the lookup as a whole is `NotFound`.
#[tokio::test]
async fn test_manager_all_remotes_fail() {
    let scenario = async {
        let me = PeerInfo::new(InstanceId::new_v4(), make_test_address());

        let first = Arc::new(MockDiscovery::new());
        let second = Arc::new(MockDiscovery::new());
        let wanted = WorkerId::from_u64(42);

        // Neither remote knows the peer.
        first.set_worker_response(wanted, Err(DiscoveryQueryError::NotFound));
        second.set_worker_response(wanted, Err(DiscoveryQueryError::NotFound));

        let remotes = vec![
            first.clone() as Arc<dyn PeerDiscovery>,
            second.clone() as Arc<dyn PeerDiscovery>,
        ];
        let manager = PeerDiscoveryManager::new(Some(me), remotes)
            .await
            .unwrap();

        let Either::Right(fut) = manager.discover_by_worker_id(wanted).await else {
            panic!("Expected async future");
        };
        let err = fut.await.unwrap_err();
        assert!(matches!(err, DiscoveryQueryError::NotFound));

        // Each remote was consulted exactly once.
        assert_eq!(first.worker_id_call_count(), 1);
        assert_eq!(second.worker_id_call_count(), 1);
    };
    let result = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(result.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
/// After a successful remote lookup, a repeated query for the same worker must
/// be answered without calling the remote backend a second time.
#[tokio::test]
async fn test_manager_cache_population_after_remote_success() {
    let scenario = async {
        let local_peer = PeerInfo::new(InstanceId::new_v4(), make_test_address());
        let backend = Arc::new(MockDiscovery::new());
        let worker = WorkerId::from_u64(42);
        let remote_instance = InstanceId::new_v4();
        let remote_peer = PeerInfo::new(remote_instance, make_test_address());
        backend.set_worker_response(worker, Ok(remote_peer.clone()));

        let manager = PeerDiscoveryManager::new(
            Some(local_peer),
            vec![backend.clone() as Arc<dyn PeerDiscovery>],
        )
        .await
        .unwrap();
        let manager = Arc::new(manager);

        // First query: nothing is cached yet, so it has to go to the remote.
        let Either::Right(first_lookup) = manager.discover_by_worker_id(worker).await else {
            panic!("Expected async future");
        };
        assert_eq!(first_lookup.await.unwrap().instance_id(), remote_instance);

        // Give time for caching
        tokio::time::sleep(Duration::from_millis(100)).await;

        // Second query: either an immediate cache hit (`Left`) or a shared
        // in-flight future (`Right`) is acceptable; both must yield the same peer.
        let second_peer = match manager.discover_by_worker_id(worker).await {
            Either::Left(ready) => ready.into_inner().unwrap(),
            Either::Right(pending) => pending.await.unwrap(),
        };
        assert_eq!(second_peer.instance_id(), remote_instance);

        // The remote backend must have been asked exactly once.
        assert_eq!(backend.worker_id_call_count(), 1);
    };

    let outcome = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(outcome.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
/// Constructing a manager with a local peer must register that peer with
/// every configured remote backend exactly once.
#[tokio::test]
async fn test_manager_register_propagates_to_remotes() {
    let scenario = async {
        let local_peer = PeerInfo::new(InstanceId::new_v4(), make_test_address());
        let first_backend = Arc::new(MockDiscovery::new());
        let second_backend = Arc::new(MockDiscovery::new());

        // Registration of `local_peer` happens as part of construction.
        let backends = vec![
            first_backend.clone() as Arc<dyn PeerDiscovery>,
            second_backend.clone() as Arc<dyn PeerDiscovery>,
        ];
        let _manager = PeerDiscoveryManager::new(Some(local_peer), backends)
            .await
            .unwrap();

        assert_eq!(first_backend.register_call_count(), 1);
        assert_eq!(second_backend.register_call_count(), 1);
    };

    let outcome = tokio::time::timeout(TEST_TIMEOUT, scenario).await;
    assert!(outcome.is_ok(), "Test timed out after {:?}", TEST_TIMEOUT);
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Peer discovery for the Dynamo Active Message system.
use anyhow::Result;
use futures::future::BoxFuture;
use std::fmt;
use std::sync::Arc;
mod address;
mod identity;
mod manager;
pub use address::{PeerInfo, WorkerAddress};
pub use identity::{InstanceId, WorkerId};
pub use manager::PeerDiscoveryManager;
/// Error type for discovery operations.
///
/// This is the error half of [`AwaitableRegisterResult`], i.e. what the
/// register/unregister methods of [`PeerDiscovery`] resolve to on failure.
#[derive(Debug, thiserror::Error)]
pub enum DiscoveryError {
/// Worker ID collision detected - same worker_id registered to different instance
///
/// Fields, in order: the colliding worker ID, the instance it is already
/// registered to, and the instance that attempted the new registration.
#[error(
"Worker ID collision: worker_id {0} already registered to instance {1}, attempted to register to {2}"
)]
WorkerIdCollision(WorkerId, InstanceId, InstanceId),
/// Address checksum mismatch during re-registration
///
/// Fields, in order: the instance, the existing checksum, and the new
/// checksum (both rendered as 16-digit hex in the message).
#[error("Address checksum mismatch for instance {0}: existing=0x{1:016x}, new=0x{2:016x}")]
ChecksumMismatch(InstanceId, u64, u64),
/// Instance already registered - duplicate registration detected
#[error("Instance {0} is already registered")]
InstanceAlreadyRegistered(InstanceId),
/// Backend-specific error
///
/// `#[from]` allows any `anyhow::Error` to be converted with `?`.
#[error("Backend error: {0}")]
Backend(#[from] anyhow::Error),
}
/// Error type for discovery lookups (the error half of [`AwaitableQueryResult`]).
///
/// The type is `Clone`; the backend error is held behind an `Arc` so that
/// cloning does not require `anyhow::Error` itself to be cloneable.
#[derive(Debug, Clone, thiserror::Error)]
pub enum DiscoveryQueryError {
/// The queried peer is not known.
#[error("Not found")]
NotFound,
/// Backend-specific failure while performing the lookup.
#[error("Backend error: {0}")]
Backend(Arc<anyhow::Error>),
}
/// Boxed `'static` future returned by lookups; resolves to the found
/// [`PeerInfo`] or a [`DiscoveryQueryError`].
pub type AwaitableQueryResult = BoxFuture<'static, Result<PeerInfo, DiscoveryQueryError>>;
/// Boxed `'static` future returned by register/unregister operations;
/// resolves to `()` or a [`DiscoveryError`].
pub type AwaitableRegisterResult = BoxFuture<'static, Result<(), DiscoveryError>>;
/// Trait for discovering [`PeerInfo`] by [`WorkerId`] or [`InstanceId`].
///
/// Implementations must be `Send + Sync + Debug`. Every method takes `&self`
/// and returns a boxed `'static` future, so the returned future cannot borrow
/// from the implementation.
pub trait PeerDiscovery: Send + Sync + fmt::Debug {
/// Lookup peer by worker_id.
///
/// Resolves to the peer's [`PeerInfo`], or to a [`DiscoveryQueryError`].
fn discover_by_worker_id(&self, worker_id: WorkerId) -> AwaitableQueryResult;
/// Lookup peer by instance_id.
///
/// Resolves to the peer's [`PeerInfo`], or to a [`DiscoveryQueryError`].
fn discover_by_instance_id(&self, instance_id: InstanceId) -> AwaitableQueryResult;
/// Register this peer in the discovery system.
///
/// Resolves to `Ok(())` on success, or to a [`DiscoveryError`].
fn register_instance(
&self,
instance_id: InstanceId,
worker_address: WorkerAddress,
) -> AwaitableRegisterResult;
/// Unregister this peer from the discovery system.
///
/// Resolves to `Ok(())` on success, or to a [`DiscoveryError`].
fn unregister_instance(&self, instance_id: InstanceId) -> AwaitableRegisterResult;
}
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use anyhow::{Context, Result, anyhow as error};
use dashmap::DashMap;
use etcd_client::ConnectOptions;
use futures::future::{BoxFuture, FutureExt, Shared};
use parking_lot::RwLock;
use std::{sync::Arc, time::Duration};
use tokio::{sync::Mutex, time::sleep};
/// Type alias for the shared reconnection future.
///
/// The future is `Shared`, so every clone observes the same result. The error
/// is wrapped in an `Arc` because a `Shared` future's output must be `Clone`.
type ReconnectFuture = Shared<BoxFuture<'static, Result<(), Arc<anyhow::Error>>>>;
/// Manages ETCD client connections with reconnection support.
///
/// The struct is `Clone`; all fields are behind `Arc`, so clones share the
/// same underlying connection, backoff state and pending-reconnect slot.
#[derive(Clone)]
pub struct Client {
/// The actual ETCD client, protected by RwLock for safe updates during reconnection
/// WARNING: Do not recursively acquire a read lock when the current thread already holds one
client: Arc<RwLock<etcd_client::Client>>,
/// Configuration for connecting to ETCD
etcd_urls: Arc<Vec<String>>,
/// Optional connection options passed to every (re)connect attempt
connect_options: Arc<Option<ConnectOptions>>,
/// Tracks the current backoff duration and last successful connect time
/// The Mutex ensures only one reconnect operation runs at a time
backoff_state: Arc<Mutex<BackoffState>>,
/// Shared reconnection futures for deduplication
/// Only one reconnection happens at a time; concurrent callers share the future
/// (the map is keyed by `()`, so it holds at most one entry)
reconnect_pending: Arc<DashMap<(), ReconnectFuture>>,
}
impl Client {
/// Connect to the etcd cluster and wrap the connection in a reconnect-capable [`Client`].
///
/// # Arguments
/// * `etcd_urls` - endpoints to connect to
/// * `connect_options` - optional etcd connection options
/// * `initial_backoff`, `min_backoff`, `max_backoff` - parameters handed to the
///   reconnect backoff state
///
/// # Errors
/// Returns the connection error if the initial connect fails.
pub async fn new(
    etcd_urls: Vec<String>,
    connect_options: Option<ConnectOptions>,
    initial_backoff: Duration,
    min_backoff: Duration,
    max_backoff: Duration,
) -> Result<Self> {
    // Establish the first connection eagerly; a failure aborts construction.
    let connection = Self::connect(&etcd_urls, &connect_options).await?;
    let backoff = BackoffState::new(initial_backoff, min_backoff, max_backoff);

    let connector = Self {
        client: Arc::new(RwLock::new(connection)),
        etcd_urls: Arc::new(etcd_urls),
        connect_options: Arc::new(connect_options),
        backoff_state: Arc::new(Mutex::new(backoff)),
        reconnect_pending: Arc::new(DashMap::new()),
    };
    Ok(connector)
}
/// Open a fresh connection to the etcd cluster.
///
/// # Errors
/// Wraps the underlying connect error with a message listing the endpoints.
async fn connect(
    etcd_urls: &[String],
    connect_options: &Option<ConnectOptions>,
) -> Result<etcd_client::Client> {
    let endpoints = etcd_urls.to_vec();
    let options = connect_options.clone();
    let attempt = etcd_client::Client::connect(endpoints, options).await;

    // The context string is only built when the attempt failed.
    attempt.with_context(|| {
        format!(
            "Unable to connect to etcd server at {}. Check etcd server status",
            etcd_urls.join(", ")
        )
    })
}
/// Return a clone of the current etcd client handle.
///
/// The read lock is only held while the clone is made.
pub fn get_client(&self) -> etcd_client::Client {
    let guard = self.client.read();
    etcd_client::Client::clone(&guard)
}
/// Ensure the client is connected, triggering reconnection if needed.
///
/// This method deduplicates concurrent reconnection attempts - only one
/// reconnection happens at a time, with all callers sharing the same future.
///
/// # Arguments
/// * `deadline` - Deadline for reconnection attempts. It is only used when this
///   call is the one that starts the reconnection; callers that join an
///   in-flight reconnection share the deadline of the caller that started it.
/// * `force` - If true, start reconnection even if not already in progress
///
/// Returns Ok(()) if connected, Err if reconnection failed.
pub async fn ensure_connected(&self, deadline: std::time::Instant, force: bool) -> Result<()> {
    use dashmap::mapref::entry::Entry;

    // Join a reconnection that is already in flight. The DashMap guard is a
    // temporary of this statement, so no map lock is held across an await.
    let in_flight = self
        .reconnect_pending
        .get(&())
        .map(|pending| pending.value().clone());

    let shared = match in_flight {
        Some(shared) => shared,
        // Nothing in flight and not forced: assume we're connected (lightweight path).
        None if !force => return Ok(()),
        // Start a new reconnection, or join one that raced us into the map.
        None => match self.reconnect_pending.entry(()) {
            Entry::Occupied(entry) => entry.get().clone(),
            Entry::Vacant(entry) => {
                let client = self.clone();
                let shared = async move { client.reconnect_impl(deadline).await.map_err(Arc::new) }
                    .boxed()
                    .shared();
                entry.insert(shared.clone());
                shared
            }
        },
    };

    // Keep an un-polled handle to the same future so that, after awaiting, we
    // can tell whether the map still holds *this* reconnection.
    let awaited = shared.clone();
    let result = shared.await.map_err(|e| anyhow::anyhow!("{}", e));
    if result.is_err() {
        // Clean up the failed future so subsequent calls can retry, but never
        // evict a newer reconnection that another caller started meanwhile.
        self.reconnect_pending
            .remove_if(&(), |_, pending| pending.ptr_eq(&awaited));
    }
    result
}
/// Reconnect to the cluster, retrying with backoff until `deadline`.
///
/// The backoff mutex is held for the whole operation, which serialises
/// reconnect attempts. Before the first attempt the backoff is reset to zero
/// if enough time has passed since the previous attempt; every failed attempt
/// is followed by a growing backoff.
///
/// On both exit paths (success or deadline exceeded) the pending-reconnect
/// slot is cleared before returning.
///
/// # Errors
/// Returns an error when the deadline passes without a successful connect.
async fn reconnect_impl(&self, deadline: std::time::Instant) -> Result<()> {
    let mut backoff = self.backoff_state.lock().await;
    tracing::warn!("Reconnecting to ETCD cluster at: {:?}", self.etcd_urls);
    backoff.attempt_reset();

    loop {
        backoff.apply_backoff(deadline).await;

        let out_of_time = std::time::Instant::now() >= deadline;
        if out_of_time {
            // Clear the pending slot so later callers can start a new attempt.
            self.reconnect_pending.remove(&());
            return Err(error!(
                "Unable to reconnect to ETCD cluster: deadline exceeded"
            ));
        }

        let failure = match Self::connect(&self.etcd_urls, &self.connect_options).await {
            Ok(fresh) => {
                tracing::info!("Successfully reconnected to ETCD cluster");
                // Swap the connection in while holding the write lock.
                let mut slot = self.client.write();
                *slot = fresh;
                self.reconnect_pending.remove(&());
                return Ok(());
            }
            Err(failure) => failure,
        };

        tracing::warn!(
            "Reconnection failed (remaining time: {:?}): {}",
            deadline.saturating_duration_since(std::time::Instant::now()),
            failure
        );
    }
}
/// Endpoints this client connects to.
#[allow(dead_code)]
pub fn etcd_urls(&self) -> &[String] {
    self.etcd_urls.as_slice()
}
/// Connection options used for every (re)connect, if any were supplied.
#[allow(dead_code)]
pub fn connect_options(&self) -> &Option<ConnectOptions> {
    let options: &Option<ConnectOptions> = &self.connect_options;
    options
}
}
/// Mutable backoff bookkeeping for reconnection attempts.
///
/// `current_backoff` is zero until the first attempt of a reconnect sequence
/// has been made; it is then seeded with `initial_backoff` and doubled after
/// every sleep, with each sleep clamped between `min_backoff` and `max_backoff`.
#[derive(Debug)]
struct BackoffState {
/// Initial backoff duration for reconnection attempts
pub initial_backoff: Duration,
/// Minimum backoff duration for reconnection attempts
pub min_backoff: Duration,
/// Maximum backoff duration for reconnection attempts
pub max_backoff: Duration,
/// Current backoff duration (starts at 0 for immediate reconnect)
current_backoff: Duration,
/// Last time a connection establishment was attempted
last_connect_attempt: std::time::Instant,
}
impl Default for BackoffState {
    /// Default parameters: 500 ms initial, 50 ms minimum, 5 s maximum backoff,
    /// starting with no backoff applied.
    fn default() -> Self {
        let initial = Duration::from_millis(500);
        let floor = Duration::from_millis(50);
        let ceiling = Duration::from_secs(5);
        Self::new(initial, floor, ceiling)
    }
}
impl BackoffState {
    /// Build a backoff state with the given parameters and no backoff applied yet.
    pub fn new(initial_backoff: Duration, min_backoff: Duration, max_backoff: Duration) -> Self {
        let created = std::time::Instant::now();
        Self {
            initial_backoff,
            min_backoff,
            max_backoff,
            current_backoff: Duration::ZERO,
            last_connect_attempt: created,
        }
    }

    /// Drop the accumulated backoff once the quiet period
    /// (`last_connect_attempt + current_backoff`) has fully elapsed.
    pub fn attempt_reset(&mut self) {
        let quiet_until = self.last_connect_attempt + self.current_backoff;
        if std::time::Instant::now() <= quiet_until {
            return;
        }
        tracing::debug!("Resetting backoff to 0 (first reconnect or enough time has passed)");
        self.current_backoff = Duration::ZERO;
    }

    /// Wait out the current backoff (if any) and prepare the next one.
    ///
    /// With no backoff pending, this returns immediately and seeds the next
    /// backoff with `initial_backoff`. Otherwise the delay is the current
    /// backoff, capped by half the time left until `deadline` and by
    /// `max_backoff`, then raised to at least `min_backoff`; the next backoff
    /// is twice that delay. In both cases the attempt timestamp is refreshed.
    pub async fn apply_backoff(&mut self, deadline: std::time::Instant) {
        if self.current_backoff.is_zero() {
            self.current_backoff = self.initial_backoff;
        } else {
            let remaining = deadline.saturating_duration_since(std::time::Instant::now());
            let delay = self
                .current_backoff
                .min(remaining / 2)
                .min(self.max_backoff)
                .max(self.min_backoff);
            self.current_backoff = delay * 2;
            tracing::debug!(
                "Applying backoff of {:?} (remaining time: {:?})",
                delay,
                remaining
            );
            sleep(delay).await;
        }
        self.last_connect_attempt = std::time::Instant::now();
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Error classification for etcd operations.
//!
//! Categorizes etcd errors into reconnectable, expected, or fatal conditions
//! to enable smart retry logic.
use std::fmt;
use tonic::Code;
/// Errors that indicate a connection issue requiring reconnection.
///
/// Produced by `classify_error` as the payload of
/// `EtcdErrorClass::Reconnectable`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReconnectableError {
/// Connection to etcd server was closed
/// (gRPC `Cancelled`/`Aborted`, or an I/O error)
ConnectionClosed,
/// Operation timed out (gRPC `DeadlineExceeded`)
Timeout,
/// Service unavailable (etcd server down or unreachable)
/// (gRPC `Unavailable`, or a transport error)
Unavailable,
/// Lease was not found (may have expired during disconnect)
/// (gRPC `NotFound` whose message mentions "lease")
LeaseNotFound,
}
impl fmt::Display for ReconnectableError {
    /// Writes a short, lower-case description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let description = match self {
            Self::ConnectionClosed => "connection closed",
            Self::Timeout => "operation timed out",
            Self::Unavailable => "service unavailable",
            Self::LeaseNotFound => "lease not found",
        };
        f.write_str(description)
    }
}
/// Classification of etcd errors for determining retry strategy.
///
/// Produced by `classify_error`.
#[derive(Debug)]
pub(crate) enum EtcdErrorClass {
/// Error should trigger reconnection and retry
Reconnectable(ReconnectableError),
/// Expected condition (key not found) - not an error
NotFound,
/// Fatal error that cannot be recovered by reconnecting
/// (carries the original error, or a message built from the gRPC status)
Fatal(anyhow::Error),
}
/// Classify an etcd error to determine appropriate handling.
///
/// # Classification Strategy
///
/// - **Reconnectable**: Connection/transport errors that can be fixed by reconnecting
/// - **NotFound**: Key doesn't exist (expected condition for queries)
/// - **Fatal**: All other errors (permissions, invalid request, etc.)
pub(crate) fn classify_error(err: etcd_client::Error) -> EtcdErrorClass {
// Use structured error matching instead of fragile string matching
match err {
etcd_client::Error::GRpcStatus(status) => {
// Classify based on gRPC status code
match status.code() {
Code::NotFound => {
// Check if it's a lease not found or key not found
let msg = status.message().to_lowercase();
if msg.contains("lease") {
EtcdErrorClass::Reconnectable(ReconnectableError::LeaseNotFound)
} else {
// Key not found is expected, not an error
EtcdErrorClass::NotFound
}
}
Code::Unavailable => EtcdErrorClass::Reconnectable(ReconnectableError::Unavailable),
Code::DeadlineExceeded => {
EtcdErrorClass::Reconnectable(ReconnectableError::Timeout)
}
Code::Cancelled | Code::Aborted => {
// Connection-related cancellations
EtcdErrorClass::Reconnectable(ReconnectableError::ConnectionClosed)
}
_ => {
// All other gRPC errors are fatal
EtcdErrorClass::Fatal(anyhow::anyhow!(
"gRPC error: {} (code: {:?})",
status.message(),
status.code()
))
}
}
}
etcd_client::Error::TransportError(_) => {
// Transport errors are reconnectable
EtcdErrorClass::Reconnectable(ReconnectableError::Unavailable)
}
etcd_client::Error::IoError(_) => {
// I/O errors are reconnectable
EtcdErrorClass::Reconnectable(ReconnectableError::ConnectionClosed)
}
_ => {
// All other errors (LeaseKeepAliveError, etc.) are fatal
EtcdErrorClass::Fatal(err.into())
}
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Resilient keep-alive task for etcd leases.
//!
//! Handles periodic keep-alive requests to prevent lease expiration,
//! with automatic reconnection and recovery on failure.
use crate::systems::etcd::client::Client;
use crate::systems::etcd::lease::LeaseState;
use anyhow::{Context, Result};
use parking_lot::RwLock;
use std::sync::Arc;
use std::time::Duration;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
/// Background task that keeps an etcd lease alive.
///
/// # Resilience Strategy
///
/// - Acquires client and starts keep-alive stream
/// - Uses stream until failure (does NOT hold client lock)
/// - On failure: triggers reconnection, reacquires client, restarts
/// - Respects shutdown signal for clean termination
pub struct KeepAliveTask {
/// Reconnect-capable etcd client wrapper
client: Arc<Client>,
/// Shared lease state; the lease ID to keep alive is read from here
lease_state: Arc<RwLock<LeaseState>>,
/// Lease TTL; the keep-alive interval is derived from it (TTL / 3, at least 1 s)
ttl: Duration,
/// Cancellation token that stops the task
shutdown: CancellationToken,
}
impl KeepAliveTask {
/// Create a new keep-alive task.
///
/// This only stores the arguments; nothing runs until `spawn` is called.
///
/// # Arguments
/// * `client` - etcd client wrapper used to open the keep-alive stream and to reconnect
/// * `lease_state` - shared lease state the lease ID is read from
/// * `ttl` - lease TTL, used to derive the keep-alive interval
/// * `shutdown` - token that stops the task when cancelled
pub fn new(
client: Arc<Client>,
lease_state: Arc<RwLock<LeaseState>>,
ttl: Duration,
shutdown: CancellationToken,
) -> Self {
Self {
client,
lease_state,
ttl,
shutdown,
}
}
/// Spawn the keep-alive task as a background tokio task.
///
/// The task repeatedly runs the keep-alive loop. When the loop fails it forces
/// a reconnection (30 s deadline) and, if that fails too, waits 5 s before
/// trying again. Both the reconnection and the retry wait are raced against
/// the shutdown token, so cancellation ends the task promptly instead of
/// after up to ~35 s.
///
/// Returns the `JoinHandle` of the spawned task.
pub fn spawn(self) -> JoinHandle<()> {
    tokio::spawn(async move {
        tracing::debug!("Keep-alive task starting");
        loop {
            // Check for shutdown signal
            if self.shutdown.is_cancelled() {
                tracing::debug!("Keep-alive task shutting down");
                break;
            }

            // Run keep-alive loop; `Ok` means it stopped because of shutdown,
            // which the check at the top of the loop picks up.
            let Err(e) = self.run_keep_alive_loop().await else {
                continue;
            };
            tracing::error!("Keep-alive loop failed: {}", e);

            // Trigger reconnection before restarting (force=true). Abandoning
            // the wait on shutdown is fine: the shared reconnection future
            // stays registered with the client for any other caller to drive.
            let deadline = std::time::Instant::now() + Duration::from_secs(30);
            let reconnected = tokio::select! {
                _ = self.shutdown.cancelled() => continue,
                outcome = self.client.ensure_connected(deadline, true) => outcome,
            };

            match reconnected {
                Ok(()) => {
                    tracing::info!("Reconnected successfully, restarting keep-alive");
                }
                Err(e) => {
                    tracing::error!("Failed to reconnect after keep-alive failure: {}", e);
                    // Wait before retry to avoid tight loop, unless shutting down
                    tokio::select! {
                        _ = self.shutdown.cancelled() => continue,
                        _ = tokio::time::sleep(Duration::from_secs(5)) => {}
                    }
                }
            }
        }
        tracing::debug!("Keep-alive task exited");
    })
}
/// Run the keep-alive loop until failure or shutdown.
///
/// # Strategy
///
/// 1. Get lease ID from state
/// 2. Acquire a client clone and start the keep-alive stream
/// 3. Repeatedly: wait one interval (TTL / 3, at least 1 s), send a
///    keep-alive request, then wait for its response
/// 4. On failure, return error (outer loop handles reconnection)
///
/// Each wait for a response is preceded by a request. Waiting on the stream
/// first (without an outstanding request) can block indefinitely, because the
/// server only answers keep-alive requests, and the lease would then expire.
///
/// # Errors
/// Returns an error if no lease ID is available, the stream cannot be opened,
/// a request cannot be sent, the stream fails or closes, or the server reports
/// a non-positive TTL (lease expired or revoked).
///
/// Returns `Ok(())` only when the shutdown token is cancelled.
async fn run_keep_alive_loop(&self) -> Result<()> {
    // Get current lease ID
    let lease_id = self
        .lease_state
        .read()
        .lease_id()
        .ok_or_else(|| anyhow::anyhow!("No lease ID available"))?;
    tracing::debug!("Starting keep-alive loop for lease {}", lease_id);

    // Work on a clone of the client so no client lock is held while streaming.
    let mut client = self.client.get_client();
    let (mut keeper, mut stream) = client
        .lease_keep_alive(lease_id)
        .await
        .context("Failed to start lease keep-alive stream")?;

    // Calculate sleep interval (TTL / 3, with minimum of 1 second)
    let sleep_interval = Duration::from_secs((self.ttl.as_secs() / 3).max(1));

    loop {
        // Wait before sending next keep-alive
        tokio::select! {
            _ = self.shutdown.cancelled() => {
                tracing::debug!("Keep-alive loop received shutdown signal during sleep");
                return Ok(());
            }
            _ = tokio::time::sleep(sleep_interval) => {
                // Time to send next keep-alive
            }
        }

        // Send keep-alive request
        if let Err(e) = keeper.keep_alive().await {
            tracing::warn!("Failed to send keep-alive for lease {}: {}", lease_id, e);
            return Err(e.into());
        }

        // Wait for the response to the request we just sent
        tokio::select! {
            _ = self.shutdown.cancelled() => {
                tracing::debug!("Keep-alive loop received shutdown signal");
                return Ok(());
            }
            msg = stream.message() => {
                match msg {
                    Ok(Some(resp)) => {
                        // A non-positive TTL means the lease no longer exists.
                        if resp.ttl() <= 0 {
                            tracing::warn!(
                                "Keep-alive for lease {} returned TTL {}; lease expired or revoked",
                                lease_id,
                                resp.ttl()
                            );
                            return Err(anyhow::anyhow!(
                                "Lease {} expired or was revoked",
                                lease_id
                            ));
                        }
                        tracing::trace!("Received keep-alive response for lease {}", lease_id);
                    }
                    Ok(None) => {
                        tracing::warn!("Keep-alive stream closed for lease {}", lease_id);
                        return Err(anyhow::anyhow!("Keep-alive stream closed"));
                    }
                    Err(e) => {
                        tracing::warn!("Keep-alive stream error for lease {}: {}", lease_id, e);
                        return Err(e.into());
                    }
                }
            }
        }
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The interval formula these tests pin down: a third of the TTL in whole
    /// seconds, never less than one second. (Exercising the task itself
    /// requires a running etcd instance.)
    fn interval_secs(ttl: Duration) -> u64 {
        (ttl.as_secs() / 3).max(1)
    }

    #[test]
    fn test_keep_alive_task_creation() {
        // Smoke test: a 60 s TTL yields a 20 s interval.
        let ttl = Duration::from_secs(60);
        assert_eq!(interval_secs(ttl), 20);
    }

    #[test]
    fn test_sleep_interval_calculation() {
        // (TTL seconds, expected interval seconds); the last row hits the 1 s floor.
        let cases = [(60, 20), (10, 3), (2, 1)];
        for (ttl_secs, expected) in cases {
            assert_eq!(interval_secs(Duration::from_secs(ttl_secs)), expected);
        }
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Lease management for etcd peer discovery.
//!
//! Handles lease creation, validation, and renewal. Attempts to reuse
//! existing leases when reconnecting to avoid unnecessary re-registration.
//!
//! # Lease Revocation Limitation
//!
//! **IMPORTANT**: If an etcd lease is revoked (either manually or due to
//! network partition), all keys associated with that lease are automatically
//! deleted by etcd. This is an **unrecoverable** state in the current
//! implementation because:
//!
//! 1. The system does not track which keys were published under a lease
//! 2. When a lease is revoked, we create a new lease but cannot republish
//! the deleted keys
//! 3. All peer registrations made with the old lease are permanently lost
//!
//! **Mitigation**: The keep-alive mechanism maintains the lease actively,
//! reducing the chance of expiration. However, extended network partitions
//! or manual lease revocation will result in lost registrations that require
//! application-level re-registration.
use anyhow::{Context, Result};
use std::time::{Duration, Instant};
use tonic::Code;
/// Result of checking lease validity.
///
/// Provides clear information about why a lease is valid or invalid.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LeaseValidityState {
/// Lease is valid with the specified remaining TTL in seconds
Valid { remaining_ttl: i64 },
/// Lease has expired (TTL <= minimum threshold)
///
/// The threshold is one third of the configured TTL; the lease still exists
/// on the server but is considered too close to expiry to reuse.
Expired,
/// Lease was not found on the etcd server
/// (also used when the server reports a remaining TTL of zero or less)
NotFound,
/// Failed to check lease validity (network error, etc.)
/// The payload is a human-readable description of the failure.
CheckFailed(String),
}
impl LeaseValidityState {
/// Returns true if the lease is valid and can be reused.
#[allow(dead_code)]
pub fn is_valid(&self) -> bool {
matches!(self, LeaseValidityState::Valid { .. })
}
}
/// State tracking for an etcd lease.
///
/// Starts without a lease; `ensure_lease` fills in the ID and creation time
/// when a lease is granted, and `clear` resets both.
#[derive(Debug)]
pub struct LeaseState {
/// Current lease ID, if one exists
lease_id: Option<i64>,
/// When the lease was created
created_at: Option<Instant>,
/// Lease TTL duration (whole seconds are used when granting a lease)
ttl: Duration,
}
impl LeaseState {
/// Create a new lease state with the specified TTL.
///
/// No lease is requested here; the state starts with no lease ID and no
/// creation time.
pub fn new(ttl: Duration) -> Self {
Self {
lease_id: None,
created_at: None,
ttl,
}
}
/// Get the current lease ID if one exists.
///
/// Returns `None` until a lease has been created, and again after `clear`.
pub fn lease_id(&self) -> Option<i64> {
self.lease_id
}
/// Get the lease TTL this state was configured with.
#[allow(dead_code)]
pub fn ttl(&self) -> Duration {
self.ttl
}
/// Return a usable lease ID, reusing the current lease when possible.
///
/// # Strategy
///
/// 1. With no lease on record, grant a new one.
/// 2. Otherwise ask the server about the recorded lease; if it is still valid
///    (more than 1/3 of the TTL remaining), return it unchanged.
/// 3. If it is expired, missing, or the check itself failed, log why and
///    grant a new lease.
///
/// A lease that is missing on the server has already taken its keys with it;
/// granting a new lease does not restore them, so the caller must re-register.
///
/// # Errors
/// Fails only if granting a new lease fails.
pub async fn ensure_lease(&mut self, client: &mut etcd_client::Client) -> Result<i64> {
    let Some(lease_id) = self.lease_id else {
        return self.create_new_lease(client).await;
    };

    let validity = self.check_lease_validity(client, lease_id).await;
    match validity {
        LeaseValidityState::Valid { remaining_ttl } => {
            tracing::debug!(
                "Reusing existing lease ID: {} (remaining TTL: {}s)",
                lease_id,
                remaining_ttl
            );
            return Ok(lease_id);
        }
        LeaseValidityState::Expired => {
            tracing::debug!("Existing lease {} expired, creating new lease", lease_id);
        }
        LeaseValidityState::NotFound => {
            // Revoked lease: its keys are already gone and cannot be
            // recovered here; only a fresh registration brings them back.
            tracing::warn!(
                "Existing lease {} not found on server (revoked). All keys associated \
                 with this lease have been deleted. Creating new lease - caller must \
                 re-register instances.",
                lease_id
            );
        }
        LeaseValidityState::CheckFailed(reason) => {
            tracing::warn!(
                "Failed to check lease {} validity: {}, creating new lease",
                lease_id,
                reason
            );
        }
    }

    self.create_new_lease(client).await
}
/// Check if a lease is still valid (has > 1/3 of TTL remaining).
///
/// Returns a `LeaseValidityState` that provides clear information about
/// the lease status.
async fn check_lease_validity(
&self,
client: &mut etcd_client::Client,
lease_id: i64,
) -> LeaseValidityState {
// Try to get lease TTL
let resp = match client.lease_time_to_live(lease_id, None).await {
Ok(resp) => resp,
Err(e) => {
// Use structured error matching instead of fragile string matching
match e {
etcd_client::Error::GRpcStatus(status) => {
match status.code() {
Code::NotFound => {
// Lease not found on the server
return LeaseValidityState::NotFound;
}
_ => {
// Other gRPC errors
return LeaseValidityState::CheckFailed(format!(
"gRPC error: {} (code: {:?})",
status.message(),
status.code()
));
}
}
}
_ => {
// Non-gRPC errors (transport, IO, etc.)
return LeaseValidityState::CheckFailed(e.to_string());
}
}
}
};
let remaining_ttl = resp.ttl();
// TTL of 0 or negative means the lease is already gone
if remaining_ttl <= 0 {
return LeaseValidityState::NotFound;
}
// Consider lease valid if it has more than 1/3 of original TTL remaining
let min_ttl = (self.ttl.as_secs() as i64) / 3;
if remaining_ttl > min_ttl {
LeaseValidityState::Valid { remaining_ttl }
} else {
LeaseValidityState::Expired
}
}
/// Grant a new lease with the configured TTL (whole seconds) and record it.
///
/// # Errors
/// Returns the grant error, with context, if the server call fails; the
/// recorded state is left untouched in that case.
async fn create_new_lease(&mut self, client: &mut etcd_client::Client) -> Result<i64> {
    let ttl_secs = self.ttl.as_secs() as i64;
    let granted = client
        .lease_grant(ttl_secs, None)
        .await
        .context("Failed to create new lease")?;

    let lease_id = granted.id();
    tracing::info!("Created new lease ID: {} (TTL: {}s)", lease_id, ttl_secs);

    self.created_at = Some(Instant::now());
    self.lease_id = Some(lease_id);
    Ok(lease_id)
}
/// Forget the recorded lease (ID and creation time); the TTL is kept.
#[allow(dead_code)]
pub fn clear(&mut self) {
    (self.lease_id, self.created_at) = (None, None);
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh state has no lease and reports the TTL it was built with.
    #[test]
    fn test_lease_state_creation() {
        let ttl = Duration::from_secs(60);
        let fresh = LeaseState::new(ttl);
        assert!(fresh.lease_id().is_none());
        assert_eq!(fresh.ttl(), ttl);
    }

    /// `clear` drops both the lease ID and the creation timestamp.
    #[test]
    fn test_lease_state_clear() {
        let mut populated = LeaseState {
            lease_id: Some(12345),
            created_at: Some(Instant::now()),
            ttl: Duration::from_secs(60),
        };
        populated.clear();
        assert!(populated.lease_id().is_none());
        assert!(populated.created_at.is_none());
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Etcd-backed peer discovery with TTL and automatic cleanup.
//!
//! This implementation provides centralized discovery using etcd with:
//! - Automatic TTL-based expiration
//! - Heartbeat keep-alive for registration freshness
//! - Transaction-based collision detection
//! - Graceful cleanup on unregister
//!
//! # Example
//!
//! ```no_run
//! use dynamo_am_discovery::etcd::EtcdConfigBuilder;
//!
//! # async fn example() -> anyhow::Result<()> {
//! let discovery = EtcdConfigBuilder::default()
//! .cluster_id("my-cluster-peers")
//! .endpoints(vec!["http://localhost:2379".to_string()])
//! .build()
//! .await?;
//!
//! // Use the discovery system
//! // let peer_discovery = discovery.peer_discovery().unwrap();
//! // peer_discovery.register_instance(instance_id, address).await?;
//! # Ok(())
//! # }
//! ```
mod client;
mod error;
mod keep_alive;
mod lease;
mod operations;
mod peer;
use keep_alive::KeepAliveTask;
use lease::LeaseState;
use operations::OperationExecutor;
use peer::EtcdPeerDiscovery;
use anyhow::{Context, Result};
use derive_builder::Builder;
use parking_lot::{Mutex, RwLock};
use std::sync::{Arc, OnceLock};
use std::time::Duration;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use validator::Validate;
use crate::peer::PeerDiscovery;
use super::DiscoverySystem;
/// Validates that a TTL is between 10 and 600 seconds (inclusive).
///
/// Only whole seconds are considered; any sub-second part is ignored.
///
/// # Errors
/// Returns a `ValidationError` with code `ttl_range` when out of range.
fn validate_ttl(ttl: &Duration) -> Result<(), validator::ValidationError> {
    match ttl.as_secs() {
        10..=600 => Ok(()),
        _ => Err(validator::ValidationError::new("ttl_range")),
    }
}
/// Configuration for etcd-backed discovery.
///
/// Instances are created through the generated `EtcdConfigBuilder`; its
/// `build` method constructs the config and initializes the discovery system.
///
/// # Example
///
/// ```no_run
/// use dynamo_am_discovery::etcd::EtcdConfigBuilder;
/// use std::time::Duration;
///
/// # async fn example() -> anyhow::Result<()> {
/// let system = EtcdConfigBuilder::default()
/// .cluster_id("my-cluster")
/// .endpoints(vec!["http://localhost:2379".to_string()])
/// .ttl(Duration::from_secs(60))
/// .build()
/// .await?;
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, Builder, Validate)]
#[builder(build_fn(private, name = "build_config"), pattern = "owned")]
pub struct EtcdConfig {
/// Cluster ID / key prefix for discovery data (required)
#[builder(setter(into))]
#[validate(custom(function = "super::validation::validate_cluster_id"))]
pub cluster_id: String,
/// Etcd cluster endpoints (e.g., `["http://localhost:2379"]`)
/// (default: `["http://localhost:2379"]`)
#[builder(default = "vec![\"http://localhost:2379\".to_string()]")]
pub endpoints: Vec<String>,
/// Lease TTL duration (default: 60 seconds, min: 10s, max: 600s)
#[builder(default = "Duration::from_secs(60)")]
#[validate(custom(function = "validate_ttl"))]
pub ttl: Duration,
/// Timeout for individual operations (default: 30 seconds)
#[builder(default = "Duration::from_secs(30)")]
pub operation_timeout: Duration,
/// Maximum number of retries for operations (default: 3, valid range: 0-3)
#[builder(default = "3")]
#[validate(range(min = 0, max = 3))]
pub max_retries: u32,
/// Initial backoff duration for reconnection attempts (default: 500ms)
#[builder(default = "Duration::from_millis(500)")]
pub initial_backoff: Duration,
/// Minimum backoff duration for reconnection attempts (default: 50ms)
#[builder(default = "Duration::from_millis(50)")]
pub min_backoff: Duration,
/// Maximum backoff duration for reconnection attempts (default: 5s)
#[builder(default = "Duration::from_secs(5)")]
pub max_backoff: Duration,
}
/// Async `build` for the generated `EtcdConfigBuilder`.
impl EtcdConfigBuilder {
    /// Build the configuration and initialize the etcd discovery system in one call.
    ///
    /// # Returns
    ///
    /// * `Ok(Arc<dyn DiscoverySystem>)` - the initialized discovery system
    /// * `Err` - the configuration could not be built, or system initialization failed
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use dynamo_am_discovery::etcd::EtcdConfigBuilder;
    /// # #[tokio::main]
    /// # async fn main() -> anyhow::Result<()> {
    /// let system = EtcdConfigBuilder::default()
    ///     .cluster_id("my-cluster-peers")
    ///     .build()
    ///     .await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn build(self) -> Result<Arc<dyn DiscoverySystem>, anyhow::Error> {
        // `build_config` is the private builder method generated by derive_builder.
        match self.build_config() {
            Ok(config) => EtcdDiscoverySystem::new(config).await,
            Err(e) => Err(anyhow::anyhow!("Failed to build config: {}", e)),
        }
    }
}
/// Private implementation of etcd-backed discovery system.
///
/// Manages connection, lease, keep-alive, and provides PeerDiscovery instances.
struct EtcdDiscoverySystem {
/// Shared etcd client wrapper; also handed to the keep-alive task and to operation executors.
client: Arc<client::Client>,
/// Lease bookkeeping shared with the keep-alive task and the peer discovery instance.
lease_state: Arc<RwLock<LeaseState>>,
/// Configuration this system was built from.
config: EtcdConfig,
/// Handle of the spawned keep-alive task; taken (set to `None`) on shutdown.
keep_alive_handle: Mutex<Option<JoinHandle<()>>>,
/// Token cancelled on shutdown; a clone is given to the keep-alive task.
shutdown: CancellationToken,
/// Lazily created peer discovery instance, built on first `peer_discovery()` call.
peer_discovery: OnceLock<Arc<dyn PeerDiscovery>>,
}
impl std::fmt::Debug for EtcdDiscoverySystem {
    /// Prints only the identifying configuration (cluster id, endpoints, TTL);
    /// the client, lease state and task handle are left out.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let cfg = &self.config;
        let mut out = f.debug_struct("EtcdDiscoverySystem");
        out.field("cluster_id", &cfg.cluster_id);
        out.field("endpoints", &cfg.endpoints);
        out.field("ttl", &cfg.ttl);
        out.finish()
    }
}
impl EtcdDiscoverySystem {
/// Create a new etcd discovery system.
///
/// # Steps
///
/// 1. Connect to etcd cluster
/// 2. Create lease with TTL
/// 3. Start keep-alive task
/// 4. Return system ready for use
///
/// # Errors
///
/// Fails if the client cannot be created ("Failed to connect to etcd cluster")
/// or if the initial lease cannot be ensured ("Failed to create initial lease").
// `new_ret_no_self`: the constructor returns a type-erased `Arc<dyn DiscoverySystem>`.
// `await_holding_lock`: the guard from `lease_state.write()` is held across the
// `ensure_lease(..).await` below.
#[allow(clippy::await_holding_lock, clippy::new_ret_no_self)]
async fn new(config: EtcdConfig) -> Result<Arc<dyn DiscoverySystem>> {
// Connect to etcd with backoff configuration
let client = Arc::new(
client::Client::new(
config.endpoints.clone(),
None,
config.initial_backoff,
config.min_backoff,
config.max_backoff,
)
.await
.context("Failed to connect to etcd cluster")?,
);
tracing::info!(
"Connected to etcd cluster: {:?}, cluster_id: {}, TTL: {:?}",
config.endpoints,
config.cluster_id,
config.ttl
);
// Initialize lease state
let lease_state = Arc::new(RwLock::new(LeaseState::new(config.ttl)));
// Create and ensure lease
// Note: This is initialization code, no concurrency yet
// (the keep-alive task that shares `lease_state` is only spawned further down).
{
let mut etcd_client = client.get_client();
lease_state
.write()
.ensure_lease(&mut etcd_client)
.await
.context("Failed to create initial lease")?;
}
let shutdown = CancellationToken::new();
// The system keeps its own clones; the originals are moved into the keep-alive task.
let system = Arc::new(Self {
client: client.clone(),
lease_state: lease_state.clone(),
config,
keep_alive_handle: Mutex::new(None),
shutdown: shutdown.clone(),
peer_discovery: OnceLock::new(),
});
// Start keep-alive task
// (`config` was moved into `system`, so the TTL is read back from there.)
let keep_alive_task = KeepAliveTask::new(client, lease_state, system.config.ttl, shutdown);
let handle = keep_alive_task.spawn();
// Store the handle so `shutdown()` can abort the task.
*system.keep_alive_handle.lock() = Some(handle);
tracing::info!("Etcd discovery system initialized successfully");
Ok(system)
}
}
impl DiscoverySystem for EtcdDiscoverySystem {
    /// Return the shared etcd peer discovery instance, creating it on first use.
    ///
    /// Always returns `Some`; every call hands out a clone of the same `Arc`.
    fn peer_discovery(&self) -> Option<Arc<dyn PeerDiscovery>> {
        let build_discovery = || {
            let cfg = &self.config;
            let executor = OperationExecutor::new(
                self.client.clone(),
                cfg.operation_timeout,
                cfg.max_retries,
            );
            let peers = EtcdPeerDiscovery::new(
                executor,
                self.lease_state.clone(),
                cfg.cluster_id.clone(),
            );
            let erased: Arc<dyn PeerDiscovery> = Arc::new(peers);
            erased
        };
        let shared = self.peer_discovery.get_or_init(build_discovery);
        Some(shared.clone())
    }

    /// Cancel the shutdown token and abort the keep-alive task if it is still
    /// registered. A second call finds no handle and only cancels/logs again.
    fn shutdown(&self) {
        tracing::info!("Shutting down EtcdDiscoverySystem");

        // Signal every task that holds a clone of the token.
        self.shutdown.cancel();

        // Take the handle out so the task is aborted at most once.
        let running_task = self.keep_alive_handle.lock().take();
        if let Some(task) = running_task {
            task.abort();
        }

        tracing::info!("EtcdDiscoverySystem shutdown complete");
    }
}
impl Drop for EtcdDiscoverySystem {
fn drop(&mut self) {
// Ensure shutdown is called on drop
self.shutdown();
}
}
// Integration tests for the etcd discovery system. The module is only compiled
// for test builds with the `etcd` feature; each test is additionally marked
// `ignore` unless the `testing-etcd` feature is enabled.
#[cfg(all(test, feature = "etcd"))]
mod tests {
use super::*;
use crate::peer::{InstanceId, WorkerAddress};
use crate::systems::DiscoverySystem;
use crate::systems::test_support::{
checksum_validation, collision_detection, not_found_errors,
register_and_discover_by_instance_id, register_and_discover_by_worker_id,
};
use std::sync::Arc;
// Note: These tests require a running etcd instance
//
// Quick start:
// docker run -d -p 2379:2379 --name etcd-test quay.io/coreos/etcd:v3.5.0 \
// /usr/local/bin/etcd --advertise-client-urls http://0.0.0.0:2379 \
// --listen-client-urls http://0.0.0.0:2379
//
// Run tests (enabled by default with 'testing-etcd' feature):
// cargo test --package dynamo-discovery --lib --features etcd
//
// To skip these tests, disable the feature:
// cargo test --package dynamo-discovery --lib --features etcd --no-default-features
/// Helper function to get etcd endpoint for tests
///
/// Reads `ETCD_ENDPOINT` and falls back to `http://127.0.0.1:2379` when unset.
fn etcd_endpoint() -> String {
std::env::var("ETCD_ENDPOINT").unwrap_or_else(|_| "http://127.0.0.1:2379".to_string())
}
/// Fixed worker address used by the tests that register an instance directly.
fn make_test_address() -> WorkerAddress {
WorkerAddress::from_bytes(b"127.0.0.1:8080".as_slice())
}
/// Build an etcd discovery system for `cluster_id` against the test endpoint
/// with a 30 second lease TTL. Passed to the shared `test_support` scenarios.
fn system_factory(
cluster_id: String,
) -> impl std::future::Future<Output = anyhow::Result<Arc<dyn DiscoverySystem>>> {
let endpoint = etcd_endpoint();
async move {
EtcdConfigBuilder::default()
.cluster_id(cluster_id)
.endpoints(vec![endpoint])
.ttl(Duration::from_secs(30))
.build()
.await
}
}
#[cfg_attr(not(feature = "testing-etcd"), ignore)]
#[tokio::test]
async fn test_etcd_register_and_discover_by_worker_id() {
register_and_discover_by_worker_id(system_factory)
.await
.expect("worker_id discovery test failed");
}
#[cfg_attr(not(feature = "testing-etcd"), ignore)]
#[tokio::test]
async fn test_etcd_register_and_discover_by_instance_id() {
register_and_discover_by_instance_id(system_factory)
.await
.expect("instance_id discovery test failed");
}
#[cfg_attr(not(feature = "testing-etcd"), ignore)]
#[tokio::test]
async fn test_etcd_collision_detection() {
collision_detection(system_factory)
.await
.expect("collision detection test failed");
}
#[cfg_attr(not(feature = "testing-etcd"), ignore)]
#[tokio::test]
async fn test_etcd_checksum_validation() {
checksum_validation(system_factory)
.await
.expect("checksum validation test failed");
}
#[cfg_attr(not(feature = "testing-etcd"), ignore)]
#[tokio::test]
async fn test_etcd_not_found_errors() {
not_found_errors(system_factory)
.await
.expect("not found error test failed");
}
// Registers an instance, unregisters it, and checks it stops being discoverable
// without waiting for the lease TTL.
#[cfg_attr(not(feature = "testing-etcd"), ignore)]
#[tokio::test]
async fn test_etcd_unregister_revokes_lease() {
let system = system_factory("test-revoke".to_string())
.await
.expect("Failed to build discovery system");
let peer_discovery = system
.peer_discovery()
.expect("Peer discovery should be available");
let instance_id = InstanceId::new_v4();
let address = make_test_address();
let worker_id = instance_id.worker_id();
peer_discovery
.register_instance(instance_id, address.clone())
.await
.unwrap();
// Verify it's registered
let found = peer_discovery
.discover_by_worker_id(worker_id)
.await
.unwrap();
assert_eq!(found.instance_id(), instance_id);
// Unregister should remove the registration immediately.
// NOTE(review): the test name mentions a lease revoke; confirm whether
// `unregister_instance` revokes the lease or only deletes the keys.
peer_discovery
.unregister_instance(instance_id)
.await
.unwrap();
// Should no longer be discoverable (no need to wait for TTL)
let result = peer_discovery.discover_by_worker_id(worker_id).await;
assert!(
result.is_err(),
"Unregistered peer should not be discoverable"
);
system.shutdown();
}
#[cfg_attr(not(feature = "testing-etcd"), ignore)]
#[tokio::test]
async fn test_etcd_multiple_discovery_instances() {
// Test that multiple discovery instances can share the same etcd
let cluster_id = "test-shared".to_string();
let system1 = system_factory(cluster_id.clone())
.await
.expect("Failed to build discovery system 1");
let system2 = system_factory(cluster_id)
.await
.expect("Failed to build discovery system 2");
let peer_discovery1 = system1
.peer_discovery()
.expect("Peer discovery 1 should be available");
let peer_discovery2 = system2
.peer_discovery()
.expect("Peer discovery 2 should be available");
let instance_id = InstanceId::new_v4();
let address = make_test_address();
let worker_id = instance_id.worker_id();
// Register on discovery1
peer_discovery1
.register_instance(instance_id, address.clone())
.await
.unwrap();
// Should be visible from discovery2
let found = peer_discovery2
.discover_by_worker_id(worker_id)
.await
.unwrap();
assert_eq!(found.instance_id(), instance_id);
// Cleanup from either instance should work
peer_discovery2
.unregister_instance(instance_id)
.await
.unwrap();
// Should no longer be discoverable
let result = peer_discovery1.discover_by_worker_id(worker_id).await;
assert!(result.is_err());
system1.shutdown();
system2.shutdown();
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Operation execution with automatic retry and reconnection.
//!
//! Wraps etcd operations to handle transient connection failures transparently.
use crate::peer::{DiscoveryError, DiscoveryQueryError};
use crate::systems::etcd::client::Client;
use crate::systems::etcd::error::{EtcdErrorClass, classify_error};
use anyhow::Result;
use futures::future::BoxFuture;
use std::sync::Arc;
use std::time::{Duration, Instant};
/// Executes etcd operations with automatic reconnection on transient errors.
#[derive(Clone)]
pub struct OperationExecutor {
/// Shared client wrapper used to obtain connections and trigger reconnects.
client: Arc<Client>,
/// Overall time budget for one `execute_*` call, including all retries.
default_timeout: Duration,
/// Retry limit applied to reconnectable errors.
max_retries: u32,
}
impl OperationExecutor {
/// Create a new operation executor.
pub fn new(client: Arc<Client>, default_timeout: Duration, max_retries: u32) -> Self {
Self {
client,
default_timeout,
max_retries,
}
}
/// Execute a query operation with automatic retry on reconnectable errors.
///
/// # Arguments
///
/// * `op` - Function that performs the etcd operation given a client
///
/// # Returns
///
/// * `Ok(T)` - Operation succeeded
/// * `Err(DiscoveryQueryError::NotFound)` - Key not found (expected)
/// * `Err(DiscoveryQueryError::Backend)` - Fatal error or timeout
///
/// # Behavior
///
/// 1. Acquire client (brief RwLock read)
/// 2. Execute operation
/// 3. On reconnectable error:
/// - Trigger reconnection via `ensure_connected()`
/// - Retry operation
/// 4. On NotFound: return DiscoveryQueryError::NotFound
/// 5. On Fatal error: return DiscoveryQueryError::Backend
pub async fn execute_query<F, T>(&self, op: F) -> Result<T, DiscoveryQueryError>
where
F: Fn(etcd_client::Client) -> BoxFuture<'static, Result<T, etcd_client::Error>>,
{
let deadline = Instant::now() + self.default_timeout;
let mut retry_count = 0;
loop {
// Check deadline
if Instant::now() >= deadline {
return Err(DiscoveryQueryError::Backend(Arc::new(anyhow::anyhow!(
"Operation timed out after {:?}",
self.default_timeout
))));
}
// Await any in-progress reconnection (lightweight check)
if let Err(e) = self.client.ensure_connected(deadline, false).await {
return Err(DiscoveryQueryError::Backend(Arc::new(e)));
}
// Acquire client (brief lock)
let client = self.client.get_client();
// Execute operation
match op(client).await {
Ok(result) => {
return Ok(result);
}
Err(err) => {
// Classify the error to determine action
match classify_error(err) {
EtcdErrorClass::Reconnectable(kind) => {
retry_count += 1;
if retry_count >= self.max_retries {
tracing::error!(
"Max retries ({}) exceeded for reconnectable error: {:?}",
self.max_retries,
kind
);
return Err(DiscoveryQueryError::Backend(Arc::new(
anyhow::anyhow!("Max retries exceeded: {}", kind),
)));
}
tracing::debug!(
"Reconnectable error (attempt {}/{}): {:?}, retrying...",
retry_count,
self.max_retries,
kind
);
// Trigger reconnection (force=true)
if let Err(e) = self.client.ensure_connected(deadline, true).await {
tracing::error!("Failed to reconnect: {}", e);
return Err(DiscoveryQueryError::Backend(Arc::new(e)));
}
// Loop will retry operation
continue;
}
EtcdErrorClass::NotFound => {
return Err(DiscoveryQueryError::NotFound);
}
EtcdErrorClass::Fatal(e) => {
return Err(DiscoveryQueryError::Backend(Arc::new(e)));
}
}
}
}
}
}
/// Execute a write operation (register/unregister) with automatic retry.
///
/// Similar to `execute_query` but returns `DiscoveryError` instead.
pub async fn execute_write<F>(&self, op: F) -> Result<(), DiscoveryError>
where
F: Fn(etcd_client::Client) -> BoxFuture<'static, Result<(), etcd_client::Error>>,
{
let deadline = Instant::now() + self.default_timeout;
let mut retry_count = 0;
loop {
// Check deadline
if Instant::now() >= deadline {
return Err(DiscoveryError::Backend(anyhow::anyhow!(
"Operation timed out after {:?}",
self.default_timeout
)));
}
// Await any in-progress reconnection (lightweight check)
if let Err(e) = self.client.ensure_connected(deadline, false).await {
return Err(DiscoveryError::Backend(e));
}
// Acquire client (brief lock)
let client = self.client.get_client();
// Execute operation
match op(client).await {
Ok(()) => {
return Ok(());
}
Err(err) => {
// Classify the error to determine action
match classify_error(err) {
EtcdErrorClass::Reconnectable(kind) => {
retry_count += 1;
if retry_count >= self.max_retries {
tracing::error!(
"Max retries ({}) exceeded for reconnectable error: {:?}",
self.max_retries,
kind
);
return Err(DiscoveryError::Backend(anyhow::anyhow!(
"Max retries exceeded: {}",
kind
)));
}
tracing::debug!(
"Reconnectable error (attempt {}/{}): {:?}, retrying...",
retry_count,
self.max_retries,
kind
);
// Trigger reconnection (force=true)
if let Err(e) = self.client.ensure_connected(deadline, true).await {
tracing::error!("Failed to reconnect: {}", e);
return Err(DiscoveryError::Backend(e));
}
// Loop will retry operation
continue;
}
EtcdErrorClass::NotFound => {
// For writes, NotFound might be valid (e.g., deleting non-existent key)
// Treat as success
tracing::debug!("Write operation: key not found (treating as success)");
return Ok(());
}
EtcdErrorClass::Fatal(e) => {
return Err(DiscoveryError::Backend(e));
}
}
}
}
}
}
/// Get the underlying client reference.
#[allow(dead_code)]
pub fn client(&self) -> &Arc<Client> {
&self.client
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::sync::Arc;
use crate::peer::{
DiscoveryError, DiscoveryQueryError, InstanceId, PeerDiscovery, PeerInfo, WorkerAddress,
WorkerId,
};
use crate::systems::etcd::lease::LeaseState;
use crate::systems::etcd::operations::OperationExecutor;
use anyhow::{Context, Result};
use etcd_client::{Compare, CompareOp, PutOptions, Txn, TxnOp};
use futures::future::BoxFuture;
use parking_lot::RwLock;
/// `PeerDiscovery` implementation that stores peer records in etcd.
pub(crate) struct EtcdPeerDiscovery {
/// Runs etcd operations with retry and reconnection handling.
executor: OperationExecutor,
/// Shared lease state; its current lease id is attached to registered keys.
lease_state: Arc<RwLock<LeaseState>>,
/// Cluster id embedded in every key this instance reads or writes.
cluster_id: String,
}
impl std::fmt::Debug for EtcdPeerDiscovery {
    /// Shows only the cluster id; the executor and lease state are omitted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("EtcdPeerDiscovery");
        out.field("cluster_id", &self.cluster_id);
        out.finish()
    }
}
impl EtcdPeerDiscovery {
pub fn new(
executor: OperationExecutor,
lease_state: Arc<RwLock<LeaseState>>,
cluster_id: String,
) -> Self {
Self {
executor,
lease_state,
cluster_id,
}
}
/// Generate etcd key for worker_id lookup.
fn worker_key(&self, worker_id: WorkerId) -> String {
format!(
"discovery://{}/peer-discovery/by-worker-id/{}",
self.cluster_id,
worker_id.as_u64()
)
}
/// Generate etcd key for instance_id lookup.
fn instance_key(&self, instance_id: InstanceId) -> String {
format!(
"discovery://{}/peer-discovery/by-instance-id/{}",
self.cluster_id, instance_id
)
}
}
impl PeerDiscovery for EtcdPeerDiscovery {
fn discover_by_worker_id(
&self,
worker_id: WorkerId,
) -> BoxFuture<'static, Result<PeerInfo, DiscoveryQueryError>> {
let key = self.worker_key(worker_id);
let executor = self.executor.clone();
Box::pin(async move {
executor
.execute_query(|mut client| {
let key = key.clone();
Box::pin(async move {
let resp = client.get(key, None).await?;
let kv = resp.kvs().first().ok_or_else(|| {
etcd_client::Error::from(std::io::Error::new(
std::io::ErrorKind::NotFound,
"key not found",
))
})?;
let value = kv.value().to_vec();
let peer_info: PeerInfo = serde_json::from_slice(&value).map_err(|e| {
etcd_client::Error::from(std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("Failed to deserialize PeerInfo: {}", e),
))
})?;
Ok(peer_info)
})
})
.await
})
}
fn discover_by_instance_id(
&self,
instance_id: InstanceId,
) -> BoxFuture<'static, Result<PeerInfo, DiscoveryQueryError>> {
let key = self.instance_key(instance_id);
let executor = self.executor.clone();
Box::pin(async move {
executor
.execute_query(|mut client| {
let key = key.clone();
Box::pin(async move {
let resp = client.get(key, None).await?;
let kv = resp.kvs().first().ok_or_else(|| {
etcd_client::Error::from(std::io::Error::new(
std::io::ErrorKind::NotFound,
"key not found",
))
})?;
let value = kv.value().to_vec();
let peer_info: PeerInfo = serde_json::from_slice(&value).map_err(|e| {
etcd_client::Error::from(std::io::Error::new(
std::io::ErrorKind::InvalidData,
format!("Failed to deserialize PeerInfo: {}", e),
))
})?;
Ok(peer_info)
})
})
.await
})
}
fn register_instance(
&self,
instance_id: InstanceId,
worker_address: WorkerAddress,
) -> BoxFuture<'static, Result<(), DiscoveryError>> {
let executor = self.executor.clone();
let worker_id = instance_id.worker_id();
let worker_key = self.worker_key(worker_id);
let instance_key = self.instance_key(instance_id);
let lease_state = self.lease_state.clone();
Box::pin(async move {
// Get current lease ID
let lease_id = lease_state
.read()
.lease_id()
.ok_or_else(|| DiscoveryError::Backend(anyhow::anyhow!("No lease ID available")))?;
// Serialize PeerInfo once
let value = serde_json::to_vec(&PeerInfo::new(instance_id, worker_address))
.context("Failed to serialize PeerInfo")?;
let put_options = PutOptions::new().with_lease(lease_id);
// Atomic registration: both keys must not exist
executor
.execute_write(|mut client| {
let worker_key = worker_key.clone();
let instance_key = instance_key.clone();
let value = value.clone();
let put_options = put_options.clone();
Box::pin(async move {
// Build transaction to ensure atomic registration
let txn = Txn::new()
.when(vec![
// Ensure worker_key doesn't exist (version == 0)
Compare::version(worker_key.clone(), CompareOp::Equal, 0),
// Ensure instance_key doesn't exist (version == 0)
Compare::version(instance_key.clone(), CompareOp::Equal, 0),
])
.and_then(vec![
// If both keys don't exist, write both
TxnOp::put(
worker_key.clone(),
value.clone(),
Some(put_options.clone()),
),
TxnOp::put(instance_key.clone(), value.clone(), Some(put_options)),
]);
// Execute transaction
let result = client.txn(txn).await?;
if result.succeeded() {
Ok(())
} else {
// Transaction failed - one or both keys already exist
// This could be a collision or checksum mismatch
// For now, return a generic error
// TODO: Check if existing values match (idempotent registration)
Err(etcd_client::Error::from(std::io::Error::new(
std::io::ErrorKind::AlreadyExists,
"Worker ID or Instance ID already registered",
)))
}
})
})
.await
})
}
fn unregister_instance(
&self,
instance_id: InstanceId,
) -> BoxFuture<'static, Result<(), DiscoveryError>> {
let executor = self.executor.clone();
let worker_id = instance_id.worker_id();
let worker_key = self.worker_key(worker_id);
let instance_key = self.instance_key(instance_id);
Box::pin(async move {
// Delete both keys (not atomic, but that's okay for unregister)
executor
.execute_write(|mut client| {
let worker_key = worker_key.clone();
let instance_key = instance_key.clone();
Box::pin(async move {
// Delete worker key
client.delete(worker_key, None).await?;
// Delete instance key
client.delete(instance_key, None).await?;
Ok(())
})
})
.await
})
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#[cfg(feature = "etcd")]
mod etcd;
#[cfg(feature = "etcd")]
pub use etcd::{EtcdConfig, EtcdConfigBuilder};
#[cfg(feature = "p2p")]
mod p2p;
#[cfg(feature = "p2p")]
pub use p2p::{P2pConfig, P2pConfigBuilder};
#[cfg(test)]
pub(crate) mod test_support;
mod validation;
use std::sync::Arc;
use futures::future::BoxFuture;
pub use crate::peer::PeerDiscovery;
use crate::peer::{
DiscoveryError, DiscoveryQueryError, InstanceId, PeerInfo, WorkerAddress, WorkerId,
};
/// Checks that a cluster ID is non-empty and uses only the permitted alphabet.
///
/// Permitted characters:
/// - lowercase ASCII letters (`a`-`z`)
/// - ASCII digits (`0`-`9`)
/// - hyphen (`-`)
/// - underscore (`_`)
///
/// Uppercase letters, whitespace, slashes and every other character are
/// rejected.
///
/// # Errors
///
/// * `cluster_id_empty` - the input is the empty string
/// * `cluster_id_invalid_chars` - at least one character is outside the
///   permitted alphabet
pub fn validate_cluster_id(cluster_id: &str) -> Result<(), validator::ValidationError> {
    if cluster_id.is_empty() {
        return Err(validator::ValidationError::new("cluster_id_empty"));
    }

    let only_allowed = cluster_id
        .chars()
        .all(|ch| matches!(ch, 'a'..='z' | '0'..='9' | '-' | '_'));

    if only_allowed {
        Ok(())
    } else {
        Err(validator::ValidationError::new("cluster_id_invalid_chars"))
    }
}
/// A [`DiscoverySystem`] should provide one or more concrete implementations of discovery traits in this crate.
///
/// Implementors must be `Send + Sync + Debug` so a system can be shared as
/// `Arc<dyn DiscoverySystem>`.
pub trait DiscoverySystem: Send + Sync + std::fmt::Debug {
/// Returns a [`PeerDiscovery`] implementation if available.
fn peer_discovery(&self) -> Option<Arc<dyn PeerDiscovery>>;
/// Gracefully shutdown the discovery system.
///
/// This should stop background tasks (like keep-alive), close connections,
/// and clean up resources. Implementations should make this idempotent.
///
/// Default implementation does nothing (no-op).
fn shutdown(&self) {
// Default no-op for implementations that don't need explicit shutdown
}
}
/// Wrap a system's peer discovery in a handle that also owns the system.
///
/// Returns `None` when the system offers no peer discovery. Otherwise the
/// returned handle forwards every call to the system's implementation and
/// keeps the `Arc<dyn DiscoverySystem>` alive for as long as the handle exists.
#[allow(dead_code)]
pub(crate) fn peer_discovery_handle(
    system: Arc<dyn DiscoverySystem>,
) -> Option<Arc<dyn PeerDiscovery>> {
    let inner = system.peer_discovery()?;
    let handle: Arc<dyn PeerDiscovery> = Arc::new(SystemBackedPeerDiscovery::new(system, inner));
    Some(handle)
}
/// Peer discovery wrapper that holds the owning system next to the
/// implementation it forwards to.
#[derive(Clone)]
#[allow(dead_code)]
struct SystemBackedPeerDiscovery {
/// Held so the system stays referenced while this wrapper exists; the trait
/// methods below do not read it.
system: Arc<dyn DiscoverySystem>,
/// Implementation every `PeerDiscovery` call is delegated to.
inner: Arc<dyn PeerDiscovery>,
}
impl SystemBackedPeerDiscovery {
fn new(system: Arc<dyn DiscoverySystem>, inner: Arc<dyn PeerDiscovery>) -> Self {
Self { system, inner }
}
}
impl std::fmt::Debug for SystemBackedPeerDiscovery {
    /// Prints the type name only; neither field is shown.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("SystemBackedPeerDiscovery");
        out.finish()
    }
}
impl PeerDiscovery for SystemBackedPeerDiscovery {
    /// Forwarded unchanged to the wrapped implementation.
    fn discover_by_worker_id(
        &self,
        worker_id: WorkerId,
    ) -> BoxFuture<'static, Result<PeerInfo, DiscoveryQueryError>> {
        let backend: &dyn PeerDiscovery = &*self.inner;
        backend.discover_by_worker_id(worker_id)
    }

    /// Forwarded unchanged to the wrapped implementation.
    fn discover_by_instance_id(
        &self,
        instance_id: InstanceId,
    ) -> BoxFuture<'static, Result<PeerInfo, DiscoveryQueryError>> {
        let backend: &dyn PeerDiscovery = &*self.inner;
        backend.discover_by_instance_id(instance_id)
    }

    /// Forwarded unchanged to the wrapped implementation.
    fn register_instance(
        &self,
        instance_id: InstanceId,
        worker_address: WorkerAddress,
    ) -> BoxFuture<'static, Result<(), DiscoveryError>> {
        let backend: &dyn PeerDiscovery = &*self.inner;
        backend.register_instance(instance_id, worker_address)
    }

    /// Forwarded unchanged to the wrapped implementation.
    fn unregister_instance(
        &self,
        instance_id: InstanceId,
    ) -> BoxFuture<'static, Result<(), DiscoveryError>> {
        let backend: &dyn PeerDiscovery = &*self.inner;
        backend.unregister_instance(instance_id)
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Libp2p-backed peer discovery system mirroring the etcd system interface.
//!
//! This implementation wraps the legacy `p2p` discovery backend in the shared
//! [`DiscoverySystem`] abstraction so callers can type-erase the runtime and
//! request concrete discovery capabilities on demand.
mod swarm;
use anyhow::Result;
use derive_builder::Builder;
use std::sync::Arc;
use validator::Validate;
use crate::peer::PeerDiscovery;
use super::DiscoverySystem;
/// Default for `P2pConfig::listen_port`; the field documents 0 as "random".
const DEFAULT_LISTEN_PORT: u16 = 0;
/// Default for `P2pConfig::replication_factor`.
const DEFAULT_REPLICATION_FACTOR: usize = 3;
/// Default for `P2pConfig::record_ttl_secs`, in seconds.
const DEFAULT_RECORD_TTL_SECS: u64 = 600;
/// Configuration for libp2p-based discovery.
///
/// Instances are created through the generated `P2pConfigBuilder`, whose async
/// `build` method also starts the discovery system.
///
/// # Example
///
/// ```no_run
/// use dynamo_am_discovery::systems::P2pConfigBuilder;
///
/// # async fn example() -> anyhow::Result<()> {
/// let system = P2pConfigBuilder::default()
///     .cluster_id("my-cluster")
///     .enable_mdns(true)
///     .build()
///     .await?;
///
/// let peer_discovery = system
///     .peer_discovery()
///     .expect("p2p system always provides peer discovery");
/// # Ok(())
/// # }
/// ```
// NOTE: the `#[validate(...)]` rule on `cluster_id` is only enforced when
// `Validate::validate` is called on a built value.
#[derive(Debug, Clone, Builder, Validate)]
#[builder(pattern = "owned", build_fn(private, name = "build_config"))]
pub struct P2pConfig {
/// Cluster ID / swarm key for private network admission (required)
#[builder(setter(into))]
#[validate(custom(function = "super::validation::validate_cluster_id"))]
pub cluster_id: String,
/// Port to listen on for incoming connections (default: 0 = random)
#[builder(default = "DEFAULT_LISTEN_PORT")]
pub listen_port: u16,
/// Bootstrap peer addresses (format: "host:port" or Multiaddr strings)
#[builder(default = "Vec::new()")]
pub bootstrap_peers: Vec<String>,
/// DHT replication factor (default: 3)
#[builder(default = "DEFAULT_REPLICATION_FACTOR")]
pub replication_factor: usize,
/// Enable mDNS for local network discovery (default: false)
#[builder(default = "false")]
pub enable_mdns: bool,
/// Record TTL in seconds (default: 600)
#[builder(default = "DEFAULT_RECORD_TTL_SECS")]
pub record_ttl_secs: u64,
/// Publication interval in seconds (default: ttl / 2, at least 1; filled in by `P2pConfigBuilder::build`)
#[builder(default = "None")]
pub publication_interval_secs: Option<u64>,
/// Provider publication interval in seconds (default: ttl / 2, at least 1; filled in by `P2pConfigBuilder::build`)
#[builder(default = "None")]
pub provider_publication_interval_secs: Option<u64>,
}
impl P2pConfigBuilder {
/// Build and initialize the P2P discovery system.
pub async fn build(self) -> Result<Arc<dyn DiscoverySystem>, anyhow::Error> {
let mut config = self
.build_config()
.map_err(|e| anyhow::anyhow!("Failed to build config: {e}"))?;
// Default heartbeat intervals to half the TTL to keep records alive.
let default_interval = (config.record_ttl_secs / 2).max(1);
config
.publication_interval_secs
.get_or_insert(default_interval);
config
.provider_publication_interval_secs
.get_or_insert(default_interval);
P2pDiscoverySystem::from_config(config).await
}
}
/// `DiscoverySystem` backed by the libp2p swarm in the `swarm` module.
struct P2pDiscoverySystem {
/// Configuration the system was built from (used for `Debug` output).
config: P2pConfig,
/// Swarm-backed discovery handed out by `peer_discovery()` and stopped on shutdown.
peer_discovery: Arc<swarm::P2pDiscovery>,
}
impl std::fmt::Debug for P2pDiscoverySystem {
    /// Prints the cluster id, listen port and bootstrap peers from the config.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let cfg = &self.config;
        let mut out = f.debug_struct("P2pDiscoverySystem");
        out.field("cluster_id", &cfg.cluster_id);
        out.field("listen_port", &cfg.listen_port);
        out.field("bootstrap_peers", &cfg.bootstrap_peers);
        out.finish()
    }
}
impl P2pDiscoverySystem {
    /// Start the swarm-backed discovery described by `config` and wrap it,
    /// together with the config, in a type-erased system.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `swarm::P2pDiscovery::new`.
    async fn from_config(config: P2pConfig) -> Result<Arc<dyn DiscoverySystem>, anyhow::Error> {
        let swarm_discovery = swarm::P2pDiscovery::new(
            config.cluster_id.clone(),
            config.listen_port,
            config.bootstrap_peers.clone(),
            config.replication_factor,
            config.enable_mdns,
            config.record_ttl_secs,
            config.publication_interval_secs,
            config.provider_publication_interval_secs,
        )
        .await?;

        let system = Self {
            config,
            peer_discovery: Arc::new(swarm_discovery),
        };
        Ok(Arc::new(system))
    }
}
impl DiscoverySystem for P2pDiscoverySystem {
    /// Always returns `Some`, handing out a clone of the shared swarm discovery.
    fn peer_discovery(&self) -> Option<Arc<dyn PeerDiscovery>> {
        let concrete = self.peer_discovery.clone();
        let erased: Arc<dyn PeerDiscovery> = concrete;
        Some(erased)
    }

    /// Logs the shutdown and forwards it to the swarm discovery.
    fn shutdown(&self) {
        tracing::info!("Shutting down P2pDiscoverySystem");
        let swarm_discovery = &self.peer_discovery;
        swarm_discovery.shutdown();
    }
}
impl Drop for P2pDiscoverySystem {
fn drop(&mut self) {
self.shutdown();
}
}
// Tests for the P2P discovery system; compiled only for test builds with the
// `p2p` feature. The first five tests run the shared `test_support` scenarios.
#[cfg(all(test, feature = "p2p"))]
mod tests {
use super::*;
use crate::peer::{InstanceId, WorkerAddress};
use crate::systems::test_support::{
checksum_validation, collision_detection, not_found_errors,
register_and_discover_by_instance_id, register_and_discover_by_worker_id,
};
use crate::systems::{DiscoveryQueryError, DiscoverySystem, peer_discovery_handle};
use std::sync::Arc;
/// Build a P2P discovery system for `cluster_id` on the default listen port.
fn system_factory(
cluster_id: String,
) -> impl std::future::Future<Output = anyhow::Result<Arc<dyn DiscoverySystem>>> {
async move {
P2pConfigBuilder::default()
.cluster_id(cluster_id)
.listen_port(DEFAULT_LISTEN_PORT)
.build()
.await
}
}
#[tokio::test]
async fn test_p2p_register_and_discover_by_worker_id() {
register_and_discover_by_worker_id(system_factory)
.await
.expect("worker_id discovery test failed");
}
#[tokio::test]
async fn test_p2p_register_and_discover_by_instance_id() {
register_and_discover_by_instance_id(system_factory)
.await
.expect("instance_id discovery test failed");
}
#[tokio::test]
async fn test_p2p_collision_detection() {
collision_detection(system_factory)
.await
.expect("collision detection test failed");
}
#[tokio::test]
async fn test_p2p_checksum_validation() {
checksum_validation(system_factory)
.await
.expect("checksum validation test failed");
}
#[tokio::test]
async fn test_p2p_not_found_errors() {
not_found_errors(system_factory)
.await
.expect("not found error test failed");
}
// Registers and then unregisters an instance through the system-backed handle
// and expects a subsequent lookup by worker id to report `NotFound`.
#[tokio::test]
async fn test_p2p_unregister_marks_tombstone() {
let system = system_factory("test-unregister".to_string())
.await
.expect("Failed to build discovery system");
let discovery =
peer_discovery_handle(Arc::clone(&system)).expect("Peer discovery should be available");
let instance_id = InstanceId::new_v4();
let address = WorkerAddress::from_bytes(b"127.0.0.1:9000".as_slice());
let worker_id = instance_id.worker_id();
discovery
.register_instance(instance_id, address)
.await
.expect("registration should succeed");
discovery
.unregister_instance(instance_id)
.await
.expect("unregister should publish tombstone");
let result = discovery.discover_by_worker_id(worker_id).await;
assert!(matches!(result, Err(DiscoveryQueryError::NotFound)));
system.shutdown();
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! P2P discovery implementation using libp2p Kademlia DHT.
use anyhow::{Context, Result, anyhow};
use futures::{StreamExt, future::BoxFuture};
use libp2p::{
Multiaddr, PeerId, StreamProtocol, Transport,
core::upgrade,
identity, noise,
pnet::{PnetConfig, PreSharedKey},
swarm::{NetworkBehaviour, Swarm, SwarmEvent},
tcp, yamux,
};
use libp2p_kad::{
Behaviour as Kademlia, Config as KademliaConfig, Event as KademliaEvent, Mode, QueryResult,
Quorum, Record, RecordKey, store::MemoryStore,
};
use libp2p_mdns as mdns;
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{RwLock, mpsc, oneshot};
use tracing::{debug, info, warn};
use crate::peer::{
DiscoveryError, DiscoveryQueryError, InstanceId, PeerDiscovery, PeerInfo, WorkerAddress,
WorkerId,
};
/// Dynamo Kademlia protocol name.
///
/// Converted into a `StreamProtocol` when the Kademlia config is created.
const DYNAMO_KAD_PROTOCOL: &str = "/dynamo/kad/1.0.0";
/// Derive the private-network pre-shared key for a cluster.
///
/// The key is the first 32 bytes of the BLAKE2b-512 digest of the UTF-8 bytes
/// of `cluster_id`, so the same cluster id always yields the same key.
pub fn generate_psk_from_cluster_id(cluster_id: &str) -> PreSharedKey {
    use blake2::{Blake2b512, Digest};

    let digest = {
        let mut state = Blake2b512::new();
        state.update(cluster_id.as_bytes());
        state.finalize()
    };

    let mut key = [0u8; 32];
    let key_len = key.len();
    key.copy_from_slice(&digest[..key_len]);
    PreSharedKey::new(key)
}
/// Helper error type for DHT get operations.
///
/// Converts into `DiscoveryQueryError` through the `From` impl below.
#[derive(Debug)]
enum GetRecordError {
/// No usable record was found for the key.
NotFound,
/// Any other failure, carried as an `anyhow::Error`.
Backend(anyhow::Error),
}
impl From<GetRecordError> for DiscoveryQueryError {
    /// Maps each DHT get failure onto the matching public query error; backend
    /// errors are wrapped in an `Arc`.
    fn from(err: GetRecordError) -> Self {
        match err {
            GetRecordError::Backend(cause) => Self::Backend(Arc::new(cause)),
            GetRecordError::NotFound => Self::NotFound,
        }
    }
}
/// Network behaviour combining Kademlia DHT and mDNS.
///
/// Events from both sub-behaviours are surfaced as `DynamoBehaviourEvent`.
#[derive(NetworkBehaviour)]
#[behaviour(to_swarm = "DynamoBehaviourEvent")]
struct DynamoBehaviour {
/// Kademlia DHT backed by an in-memory record store.
kad: Kademlia<MemoryStore>,
/// mDNS behaviour wrapped in `Toggle`.
// NOTE(review): presumably toggled by the `enable_mdns` setting — confirm.
mdns: libp2p::swarm::behaviour::toggle::Toggle<mdns::tokio::Behaviour>,
}
/// Event type emitted by `DynamoBehaviour`: one variant per sub-behaviour.
#[derive(Debug)]
#[allow(clippy::large_enum_variant)]
enum DynamoBehaviourEvent {
/// Event produced by the Kademlia behaviour.
Kad(KademliaEvent),
/// Event produced by the mDNS behaviour.
Mdns(mdns::Event),
}
impl From<KademliaEvent> for DynamoBehaviourEvent {
fn from(event: KademliaEvent) -> Self {
DynamoBehaviourEvent::Kad(event)
}
}
impl From<mdns::Event> for DynamoBehaviourEvent {
    /// Wraps an mDNS event in the `Mdns` variant.
    fn from(event: mdns::Event) -> Self {
        Self::Mdns(event)
    }
}
/// Providers for a key: each entry pairs a peer id with a list of addresses.
type ProviderList = Vec<(PeerId, Vec<Multiaddr>)>;
/// Requests sent from `P2pDiscovery` handles to the task that owns the swarm.
///
/// Every variant except `Shutdown` carries a oneshot sender on which the swarm
/// task delivers the outcome.
enum SwarmCommand {
/// Store `value` under `key` in the DHT.
PutRecord {
key: RecordKey,
value: Vec<u8>,
reply: oneshot::Sender<Result<()>>,
},
/// Look up the record stored under `key`.
GetRecord {
key: RecordKey,
reply: oneshot::Sender<Result<Vec<u8>, GetRecordError>>,
},
/// Announce the local node as a provider for `key`.
#[allow(dead_code)]
StartProviding {
key: RecordKey,
reply: oneshot::Sender<Result<()>>,
},
/// Ask the DHT which peers provide `key`.
#[allow(dead_code)]
GetProviders {
key: RecordKey,
reply: oneshot::Sender<Result<ProviderList>>,
},
/// Stop the swarm event loop.
#[allow(dead_code)]
Shutdown,
}
/// In-flight record lookups: Kademlia query id mapped to the reply channel of
/// the caller waiting for that query. Shared behind an `Arc<RwLock<..>>`.
type PendingGetQueries =
Arc<RwLock<HashMap<libp2p_kad::QueryId, oneshot::Sender<Result<Vec<u8>, GetRecordError>>>>>;
/// In-flight provider lookups, keyed the same way as `PendingGetQueries`.
type PendingProviderQueries =
Arc<RwLock<HashMap<libp2p_kad::QueryId, oneshot::Sender<Result<ProviderList>>>>>;
/// Handle to the libp2p discovery backend.
///
/// The swarm itself lives in a task spawned by `new`; this handle talks to it
/// through `command_tx`. Clones share the same command channel and the same
/// pending-query maps.
#[derive(Clone)]
pub(super) struct P2pDiscovery {
/// Peer id derived from the keypair generated in `new`.
local_peer_id: PeerId,
/// Sender side of the command channel consumed by the swarm event loop.
command_tx: mpsc::Sender<SwarmCommand>,
/// Reply channels of in-flight record lookups; the same `Arc` is handed to
/// the swarm task.
#[allow(dead_code)]
pending_get_queries: PendingGetQueries,
/// Reply channels of in-flight provider lookups; the same `Arc` is handed to
/// the swarm task.
#[allow(dead_code)]
pending_provider_queries: PendingProviderQueries,
}
/// Debug output shows only the local peer id; the command channel and the
/// pending-query maps are left out.
impl std::fmt::Debug for P2pDiscovery {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = formatter.debug_struct("P2pDiscovery");
        repr.field("local_peer_id", &self.local_peer_id);
        repr.finish()
    }
}
impl P2pDiscovery {
/// Decode a DHT record value into a [`PeerInfo`].
///
/// An empty value is reported as `GetRecordError::NotFound` (empty payloads
/// are what `unregister_instance` publishes). Any other value must be valid
/// JSON for `PeerInfo`; a parse failure is returned as
/// `GetRecordError::Backend`.
fn decode_peer(value: &[u8]) -> Result<PeerInfo, GetRecordError> {
    match value {
        [] => Err(GetRecordError::NotFound),
        bytes => serde_json::from_slice(bytes)
            .map_err(|parse_err| GetRecordError::Backend(parse_err.into())),
    }
}
/// Assemble the combined Kademlia + mDNS behaviour for the local node.
///
/// Kademlia runs in server mode on an in-memory store with a parallelism of
/// 10 and a 30 second query timeout. `record_ttl_secs` is applied to both
/// records and provider records; the two publication intervals are passed
/// through as given (`None` included). mDNS is only instantiated when
/// `enable_mdns` is set; otherwise the toggle is left empty.
///
/// # Panics
///
/// Panics if the protocol name is rejected, if `replication_factor` is zero,
/// or if the mDNS behaviour cannot be created.
fn create_behaviour(
    key: &identity::Keypair,
    replication_factor: usize,
    enable_mdns: bool,
    record_ttl_secs: u64,
    publication_interval_secs: Option<u64>,
    provider_publication_interval_secs: Option<u64>,
) -> DynamoBehaviour {
    let local_peer_id = key.public().to_peer_id();
    let store = MemoryStore::new(local_peer_id);

    let protocol = StreamProtocol::try_from_owned(String::from(DYNAMO_KAD_PROTOCOL))
        .expect("Valid protocol name");
    let mut kad_config = KademliaConfig::new(protocol);

    let replication =
        NonZeroUsize::new(replication_factor).expect("Replication factor must be non-zero");
    kad_config.set_replication_factor(replication);
    kad_config.set_parallelism(NonZeroUsize::new(10).unwrap());
    kad_config.set_query_timeout(Duration::from_secs(30));
    kad_config.set_publication_interval(publication_interval_secs.map(Duration::from_secs));
    kad_config.set_provider_publication_interval(
        provider_publication_interval_secs.map(Duration::from_secs),
    );
    // One TTL governs both plain records and provider records.
    let ttl = Duration::from_secs(record_ttl_secs);
    kad_config.set_record_ttl(Some(ttl));
    kad_config.set_provider_record_ttl(Some(ttl));

    let mut kad = Kademlia::with_config(local_peer_id, store, kad_config);
    kad.set_mode(Some(Mode::Server));

    // Conditionally enable mDNS based on configuration
    let mdns_behaviour = enable_mdns.then(|| {
        mdns::tokio::Behaviour::new(mdns::Config::default(), local_peer_id)
            .expect("Failed to create mDNS behaviour")
    });
    let mdns = libp2p::swarm::behaviour::toggle::Toggle::from(mdns_behaviour);

    DynamoBehaviour { kad, mdns }
}
#[allow(clippy::too_many_arguments)]
pub(super) async fn new(
cluster_id: String,
listen_port: u16,
bootstrap_peers: Vec<String>,
replication_factor: usize,
enable_mdns: bool,
record_ttl_secs: u64,
publication_interval_secs: Option<u64>,
provider_publication_interval_secs: Option<u64>,
) -> Result<Self> {
let keypair = identity::Keypair::generate_ed25519();
let local_peer_id = keypair.public().to_peer_id();
info!(
"Initializing P2P discovery for peer {} with cluster_id '{}'",
local_peer_id, cluster_id
);
let psk = generate_psk_from_cluster_id(&cluster_id);
let mut swarm = libp2p::SwarmBuilder::with_existing_identity(keypair)
.with_tokio()
.with_other_transport(move |key| {
let tcp = tcp::tokio::Transport::default();
let pnet_tcp = tcp.and_then(move |socket, _| {
let psk_clone = psk;
async move { PnetConfig::new(psk_clone).handshake(socket).await }
});
pnet_tcp
.upgrade(upgrade::Version::V1)
.authenticate(
noise::Config::new(key)
.expect("Failed to create noise config with valid keypair"),
)
.multiplex(yamux::Config::default())
.boxed()
})?
.with_behaviour(|key| {
Self::create_behaviour(
key,
replication_factor,
enable_mdns,
record_ttl_secs,
publication_interval_secs,
provider_publication_interval_secs,
)
})?
.build();
if enable_mdns {
info!("mDNS enabled for local peer discovery");
}
let listen_addr: Multiaddr = format!("/ip4/0.0.0.0/tcp/{}", listen_port)
.parse()
.context("Invalid listen address")?;
swarm
.listen_on(listen_addr.clone())
.with_context(|| format!("Failed to listen on {}", listen_addr))?;
info!("Listening on {}", listen_addr);
for peer_str in &bootstrap_peers {
let addr: Multiaddr = format!(
"/ip4/{}/tcp/{}",
peer_str.split(':').next().unwrap_or("127.0.0.1"),
peer_str.split(':').nth(1).unwrap_or("4001")
)
.parse()
.with_context(|| format!("Invalid bootstrap peer address: {}", peer_str))?;
if let Err(e) = swarm.dial(addr.clone()) {
warn!("Failed to dial bootstrap peer {}: {:?}", peer_str, e);
} else {
info!("Dialing bootstrap peer at {}", addr);
}
}
if !bootstrap_peers.is_empty() {
if let Err(e) = swarm.behaviour_mut().kad.bootstrap() {
warn!("Failed to bootstrap Kademlia DHT: {:?}", e);
} else {
info!("Started DHT bootstrap");
}
}
let (command_tx, command_rx) = mpsc::channel(100);
let pending_get_queries = Arc::new(RwLock::new(HashMap::new()));
let pending_provider_queries = Arc::new(RwLock::new(HashMap::new()));
let pending_get_queries_clone = Arc::clone(&pending_get_queries);
let pending_provider_queries_clone = Arc::clone(&pending_provider_queries);
tokio::spawn(async move {
Self::swarm_event_loop(
swarm,
command_rx,
pending_get_queries_clone,
pending_provider_queries_clone,
)
.await;
});
Ok(Self {
local_peer_id,
command_tx,
pending_get_queries,
pending_provider_queries,
})
}
async fn swarm_event_loop(
mut swarm: Swarm<DynamoBehaviour>,
mut command_rx: mpsc::Receiver<SwarmCommand>,
pending_get_queries: PendingGetQueries,
pending_provider_queries: PendingProviderQueries,
) {
loop {
tokio::select! {
Some(cmd) = command_rx.recv() => {
match cmd {
SwarmCommand::PutRecord { key, value, reply } => {
let record = Record {
key,
value,
publisher: None,
expires: None,
};
match swarm.behaviour_mut().kad.put_record(record, Quorum::One) {
Ok(_) => {
let _ = reply.send(Ok(()));
}
Err(e) => {
let _ = reply.send(Err(anyhow!("Failed to put record: {:?}", e)));
}
}
}
SwarmCommand::GetRecord { key, reply } => {
let query_id = swarm.behaviour_mut().kad.get_record(key);
pending_get_queries.write().await.insert(query_id, reply);
}
SwarmCommand::StartProviding { key, reply } => {
match swarm.behaviour_mut().kad.start_providing(key) {
Ok(_) => {
let _ = reply.send(Ok(()));
}
Err(e) => {
let _ = reply.send(Err(anyhow!("Failed to start providing: {:?}", e)));
}
}
}
SwarmCommand::GetProviders { key, reply } => {
let query_id = swarm.behaviour_mut().kad.get_providers(key);
pending_provider_queries.write().await.insert(query_id, reply);
}
SwarmCommand::Shutdown => {
info!("Shutting down P2P swarm");
break;
}
}
}
event = swarm.select_next_some() => {
match event {
SwarmEvent::Behaviour(DynamoBehaviourEvent::Kad(kad_event)) => {
Self::handle_kad_event(
kad_event,
&pending_get_queries,
&pending_provider_queries,
).await;
}
SwarmEvent::Behaviour(DynamoBehaviourEvent::Mdns(mdns_event)) => {
Self::handle_mdns_event(mdns_event, &mut swarm);
}
SwarmEvent::NewListenAddr { address, .. } => {
info!("Listening on {}", address);
}
SwarmEvent::ConnectionEstablished { peer_id, .. } => {
debug!("Connection established with peer {}", peer_id);
}
SwarmEvent::ConnectionClosed { peer_id, cause, .. } => {
debug!("Connection closed with peer {}: {:?}", peer_id, cause);
}
_ => {}
}
}
}
}
}
async fn handle_kad_event(
event: KademliaEvent,
pending_get_queries: &PendingGetQueries,
pending_provider_queries: &PendingProviderQueries,
) {
match event {
KademliaEvent::OutboundQueryProgressed {
id,
result: QueryResult::GetRecord(Ok(libp2p_kad::GetRecordOk::FoundRecord(record))),
..
} => {
if let Some(sender) = pending_get_queries.write().await.remove(&id) {
let _ = sender.send(Ok(record.record.value.clone()));
}
}
KademliaEvent::OutboundQueryProgressed {
id,
result: QueryResult::GetRecord(Err(err)),
..
} => {
if let Some(sender) = pending_get_queries.write().await.remove(&id) {
let mapped = match err {
libp2p_kad::GetRecordError::NotFound { .. } => {
Err(GetRecordError::NotFound)
}
other => Err(GetRecordError::Backend(anyhow!(
"Get record failed: {:?}",
other
))),
};
let _ = sender.send(mapped);
}
}
KademliaEvent::OutboundQueryProgressed {
id,
result:
QueryResult::GetProviders(Ok(libp2p_kad::GetProvidersOk::FoundProviders {
providers,
..
})),
..
} => {
let provider_addrs: Vec<(PeerId, Vec<Multiaddr>)> = providers
.into_iter()
.map(|peer_id| (peer_id, Vec::new()))
.collect();
if let Some(sender) = pending_provider_queries.write().await.remove(&id) {
let _ = sender.send(Ok(provider_addrs));
}
}
KademliaEvent::OutboundQueryProgressed {
id,
result: QueryResult::GetProviders(Err(e)),
..
} => {
if let Some(sender) = pending_provider_queries.write().await.remove(&id) {
let _ = sender.send(Err(anyhow!("Get providers failed: {:?}", e)));
}
}
KademliaEvent::OutboundQueryProgressed {
result: QueryResult::Bootstrap(Ok(_)),
..
} => {
info!("Kademlia bootstrap completed successfully");
}
KademliaEvent::OutboundQueryProgressed {
result: QueryResult::Bootstrap(Err(e)),
..
} => {
warn!("Kademlia bootstrap failed: {:?}", e);
}
KademliaEvent::RoutingUpdated { peer, .. } => {
debug!("Routing table updated with peer {}", peer);
}
_ => {}
}
}
/// React to an mDNS event.
///
/// Every discovered `(peer, address)` pair is added to the Kademlia routing
/// table. Expired peers are only logged; nothing is removed.
fn handle_mdns_event(event: mdns::Event, swarm: &mut Swarm<DynamoBehaviour>) {
    match event {
        mdns::Event::Discovered(found) => {
            let kad = &mut swarm.behaviour_mut().kad;
            for (peer_id, addr) in found {
                debug!("mDNS discovered peer {} at {}", peer_id, addr);
                kad.add_address(&peer_id, addr);
            }
        }
        mdns::Event::Expired(gone) => {
            gone.into_iter()
                .for_each(|(peer_id, _addr)| debug!("mDNS expired peer {}", peer_id));
        }
    }
}
/// Ask the swarm task to store `value` under `key` and wait for its answer.
///
/// Fails if the command cannot be sent, if the reply channel is dropped
/// before an answer arrives, or if the swarm task reports an error.
async fn put_record(&self, key: RecordKey, value: Vec<u8>) -> Result<()> {
    let (reply, outcome) = oneshot::channel();
    let command = SwarmCommand::PutRecord { key, value, reply };
    self.command_tx
        .send(command)
        .await
        .context("Failed to send put record command")?;
    outcome.await.context("Put record command cancelled")?
}
/// Ask the swarm task for the record stored under `key`.
///
/// Waits at most 30 seconds for the answer. A send failure, a timeout and a
/// dropped reply channel are all reported as `GetRecordError::Backend`;
/// otherwise the swarm task's own result is returned unchanged.
async fn get_record(&self, key: RecordKey) -> Result<Vec<u8>, GetRecordError> {
    let (reply, outcome) = oneshot::channel();
    let command = SwarmCommand::GetRecord { key, reply };
    if let Err(e) = self.command_tx.send(command).await {
        return Err(GetRecordError::Backend(anyhow!(
            "Failed to send get record command: {e}"
        )));
    }

    match tokio::time::timeout(Duration::from_secs(30), outcome).await {
        Err(_elapsed) => Err(GetRecordError::Backend(anyhow!("Get record timed out"))),
        Ok(Err(_dropped)) => Err(GetRecordError::Backend(anyhow!(
            "Get record command cancelled"
        ))),
        Ok(Ok(result)) => result,
    }
}
/// Announce this node as a provider of `key` in the DHT.
///
/// The key's UTF-8 bytes are used as the record key. Resolves once the swarm
/// task has answered the command; fails if the command cannot be sent, if the
/// reply channel is dropped, or if the swarm task reports an error.
#[allow(dead_code)]
pub async fn start_providing(&self, key: &str) -> Result<()> {
    let (reply, outcome) = oneshot::channel();
    let command = SwarmCommand::StartProviding {
        key: RecordKey::new(&key.as_bytes()),
        reply,
    };
    self.command_tx
        .send(command)
        .await
        .context("Failed to send start providing command")?;
    outcome.await.context("Start providing command cancelled")?
}
/// Look up the providers of `key` in the DHT.
///
/// The key's UTF-8 bytes are used as the record key. Waits at most 30 seconds
/// for the swarm task to answer; a send failure, a timeout and a dropped
/// reply channel are reported as errors.
#[allow(dead_code)]
pub async fn get_providers(&self, key: &str) -> Result<ProviderList> {
    let (reply, outcome) = oneshot::channel();
    let command = SwarmCommand::GetProviders {
        key: RecordKey::new(&key.as_bytes()),
        reply,
    };
    self.command_tx
        .send(command)
        .await
        .context("Failed to send get providers command")?;

    // Outer layer: timeout. Middle layer: dropped channel. Inner: the answer.
    tokio::time::timeout(Duration::from_secs(30), outcome)
        .await
        .context("Get providers timed out")?
        .context("Get providers command cancelled")?
}
/// Request the swarm task to stop, without waiting for it.
///
/// The `Shutdown` command is sent from a freshly spawned Tokio task, so this
/// method itself never blocks; a failed send is only logged.
pub(super) fn shutdown(&self) {
    let sender = self.command_tx.clone();
    tokio::spawn(async move {
        match sender.send(SwarmCommand::Shutdown).await {
            Ok(()) => {}
            Err(err) => warn!("Failed to send P2P shutdown command: {:?}", err),
        }
    });
}
}
impl PeerDiscovery for P2pDiscovery {
/// Resolve a peer by worker id.
///
/// The DHT key is the big-endian byte representation of the worker id's
/// `u64` value. The stored value is decoded with `decode_peer`, so an empty
/// record counts as not found.
fn discover_by_worker_id(
    &self,
    worker_id: WorkerId,
) -> BoxFuture<'static, Result<PeerInfo, DiscoveryQueryError>> {
    let discovery = self.clone();
    Box::pin(async move {
        let key = RecordKey::new(&worker_id.as_u64().to_be_bytes());
        discovery
            .get_record(key)
            .await
            .and_then(|raw| Self::decode_peer(&raw))
            .map_err(DiscoveryQueryError::from)
    })
}
/// Resolve a peer by instance id.
///
/// The DHT key is the instance id's byte representation. The stored value is
/// decoded with `decode_peer`, so an empty record counts as not found.
fn discover_by_instance_id(
    &self,
    instance_id: InstanceId,
) -> BoxFuture<'static, Result<PeerInfo, DiscoveryQueryError>> {
    let discovery = self.clone();
    Box::pin(async move {
        let key = RecordKey::new(instance_id.as_bytes());
        discovery
            .get_record(key)
            .await
            .and_then(|raw| Self::decode_peer(&raw))
            .map_err(DiscoveryQueryError::from)
    })
}
fn register_instance(
&self,
instance_id: InstanceId,
worker_address: WorkerAddress,
) -> BoxFuture<'static, Result<(), DiscoveryError>> {
let this = self.clone();
Box::pin(async move {
let worker_id = instance_id.worker_id();
let desired_peer = PeerInfo::new(instance_id, worker_address.clone());
// Collision detection on worker_id
let worker_key = RecordKey::new(&worker_id.as_u64().to_be_bytes());
match this.get_record(worker_key.clone()).await {
Ok(existing) => match Self::decode_peer(&existing) {
Ok(stored) => {
if stored.instance_id != instance_id {
return Err(DiscoveryError::WorkerIdCollision(
worker_id,
stored.instance_id,
instance_id,
));
}
if stored.address_checksum() != desired_peer.address_checksum() {
return Err(DiscoveryError::ChecksumMismatch(
instance_id,
stored.address_checksum(),
desired_peer.address_checksum(),
));
}
return Err(DiscoveryError::Backend(anyhow!(
"Instance {instance_id} already registered"
)));
}
Err(GetRecordError::NotFound) => {}
Err(GetRecordError::Backend(err)) => return Err(DiscoveryError::Backend(err)),
},
Err(GetRecordError::NotFound) => {}
Err(GetRecordError::Backend(err)) => return Err(DiscoveryError::Backend(err)),
}
// Check existing instance record for checksum mismatch
let instance_key = RecordKey::new(instance_id.as_bytes());
match this.get_record(instance_key.clone()).await {
Ok(existing) => match Self::decode_peer(&existing) {
Ok(stored) => {
if stored.address_checksum() != desired_peer.address_checksum() {
return Err(DiscoveryError::ChecksumMismatch(
instance_id,
stored.address_checksum(),
desired_peer.address_checksum(),
));
}
// Identical record already exists, treat as success (idempotent)
return Ok(());
}
Err(GetRecordError::NotFound) => {}
Err(GetRecordError::Backend(err)) => return Err(DiscoveryError::Backend(err)),
},
Err(GetRecordError::NotFound) => {}
Err(GetRecordError::Backend(err)) => return Err(DiscoveryError::Backend(err)),
}
let payload = serde_json::to_vec(&desired_peer)
.context("Failed to serialize PeerInfo")
.map_err(DiscoveryError::Backend)?;
this.put_record(worker_key, payload.clone())
.await
.map_err(DiscoveryError::Backend)?;
this.put_record(instance_key, payload)
.await
.map_err(DiscoveryError::Backend)?;
Ok(())
})
}
/// Withdraw an instance from the DHT.
///
/// Nothing is deleted: an empty value (a tombstone) is written under the
/// worker-id key and then under the instance-id key. `decode_peer` treats
/// such empty values as not found. The first failing write aborts the
/// operation with a backend error.
fn unregister_instance(
    &self,
    instance_id: InstanceId,
) -> BoxFuture<'static, Result<(), DiscoveryError>> {
    let discovery = self.clone();
    Box::pin(async move {
        let worker_id = instance_id.worker_id();
        let tombstone_keys = [
            RecordKey::new(&worker_id.as_u64().to_be_bytes()),
            RecordKey::new(instance_id.as_bytes()),
        ];
        for key in tombstone_keys {
            discovery
                .put_record(key, Vec::new())
                .await
                .map_err(DiscoveryError::Backend)?;
        }
        debug!(
            "Published tombstone for instance {} (worker_id {})",
            instance_id, worker_id
        );
        Ok(())
    })
}
}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment