Unverified Commit e5850e23 authored by Yan Ru Pei, committed by GitHub
Browse files

feat(kv-router): add ActiveSequences benchmark and extract common bench utils (#6633)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent b302ec41
......@@ -1448,9 +1448,9 @@ dependencies = [
[[package]]
name = "cudarc"
version = "0.19.2"
version = "0.19.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aed81f178e780f3d5d354d12b4c5c5a484c4a9c329ecd037ac57f2a0e0648397"
checksum = "6468cb7fa330840f3ebcd8df51edc0e7bf5c18df524792ce6004c6821851cdf3"
dependencies = [
"half 2.7.1",
"libloading 0.9.0",
......@@ -1772,9 +1772,9 @@ dependencies = [
[[package]]
name = "dispatch2"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89a09f22a6c6069a18470eb92d2298acf25463f14256d24778e1230d789a2aec"
checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38"
dependencies = [
"bitflags 2.11.0",
"objc2",
......@@ -1881,6 +1881,7 @@ dependencies = [
"clap 4.5.60",
"dashmap 6.1.0",
"derive-getters",
"derive_builder",
"dynamo-bench",
"dynamo-mocker",
"dynamo-runtime",
......@@ -1902,6 +1903,7 @@ dependencies = [
"tokio-util",
"tracing",
"uuid",
"validator",
"xxhash-rust",
]
......@@ -3620,9 +3622,9 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.88"
version = "0.3.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7e709f3e3d22866f9c25b3aff01af289b18422cc8b4262fb19103ee80fe513d"
checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
dependencies = [
"once_cell",
"wasm-bindgen",
......@@ -4003,7 +4005,7 @@ checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
dependencies = [
"bitflags 2.11.0",
"libc",
"redox_syscall 0.7.1",
"redox_syscall 0.7.3",
]
[[package]]
......@@ -4823,9 +4825,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "objc2"
version = "0.6.3"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c2599ce0ec54857b29ce62166b0ed9b4f6f1a70ccc9a71165b6154caca8c05"
checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f"
dependencies = [
"objc2-encode",
]
......@@ -5426,18 +5428,18 @@ dependencies = [
[[package]]
name = "pin-project"
version = "1.1.10"
version = "1.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.10"
version = "1.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6"
dependencies = [
"proc-macro2",
"quote",
......@@ -5446,9 +5448,9 @@ dependencies = [
[[package]]
name = "pin-project-lite"
version = "0.2.16"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
[[package]]
name = "pin-utils"
......@@ -6112,9 +6114,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.7.1"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35985aa610addc02e24fc232012c86fd11f14111180f902b67e2d5331f8ebf2b"
checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16"
dependencies = [
"bitflags 2.11.0",
]
......@@ -6175,9 +6177,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
version = "0.8.9"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c"
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
[[package]]
name = "relative-path"
......@@ -6286,9 +6288,9 @@ dependencies = [
[[package]]
name = "rgb"
version = "0.8.52"
version = "0.8.53"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce"
checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4"
[[package]]
name = "ring"
......@@ -6559,9 +6561,9 @@ dependencies = [
[[package]]
name = "rustls"
version = "0.23.36"
version = "0.23.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b"
checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
dependencies = [
"aws-lc-rs",
"log",
......@@ -7004,9 +7006,9 @@ dependencies = [
[[package]]
name = "serde_with"
version = "3.16.1"
version = "3.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7"
checksum = "381b283ce7bc6b476d903296fb59d0d36633652b633b27f64db4fb46dcbfc3b9"
dependencies = [
"base64 0.22.1",
"chrono",
......@@ -7023,9 +7025,9 @@ dependencies = [
[[package]]
name = "serde_with_macros"
version = "3.16.1"
version = "3.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c"
checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0"
dependencies = [
"darling 0.21.3",
"proc-macro2",
......@@ -7424,9 +7426,9 @@ dependencies = [
[[package]]
name = "tempfile"
version = "3.25.0"
version = "3.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1"
checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0"
dependencies = [
"fastrand",
"getrandom 0.4.1",
......@@ -8644,9 +8646,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen"
version = "0.2.111"
version = "0.2.113"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec1adf1535672f5b7824f817792b1afd731d7e843d2d04ec8f27e8cb51edd8ac"
checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2"
dependencies = [
"cfg-if 1.0.4",
"once_cell",
......@@ -8657,9 +8659,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.61"
version = "0.4.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe88540d1c934c4ec8e6db0afa536876c5441289d7f9f9123d4f065ac1250a6b"
checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a"
dependencies = [
"cfg-if 1.0.4",
"futures-util",
......@@ -8671,9 +8673,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.111"
version = "0.2.113"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19e638317c08b21663aed4d2b9a2091450548954695ff4efa75bff5fa546b3b1"
checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
......@@ -8681,9 +8683,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.111"
version = "0.2.113"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c64760850114d03d5f65457e96fc988f11f01d38fbaa51b254e4ab5809102af"
checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60"
dependencies = [
"bumpalo",
"proc-macro2",
......@@ -8694,9 +8696,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.111"
version = "0.2.113"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60eecd4fe26177cfa3339eb00b4a36445889ba3ad37080c2429879718e20ca41"
checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5"
dependencies = [
"unicode-ident",
]
......@@ -8750,9 +8752,9 @@ dependencies = [
[[package]]
name = "web-sys"
version = "0.3.88"
version = "0.3.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d6bb20ed2d9572df8584f6dc81d68a41a625cadc6f15999d649a70ce7e3597a"
checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97"
dependencies = [
"js-sys",
"wasm-bindgen",
......@@ -9313,18 +9315,18 @@ dependencies = [
[[package]]
name = "zerocopy"
version = "0.8.39"
version = "0.8.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a"
checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.39"
version = "0.8.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517"
checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953"
dependencies = [
"proc-macro2",
"quote",
......
......@@ -1429,9 +1429,9 @@ dependencies = [
[[package]]
name = "dispatch2"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89a09f22a6c6069a18470eb92d2298acf25463f14256d24778e1230d789a2aec"
checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38"
dependencies = [
"bitflags 2.11.0",
"objc2",
......@@ -1521,6 +1521,7 @@ dependencies = [
"async-trait",
"dashmap 6.1.0",
"derive-getters",
"derive_builder",
"dynamo-runtime",
"dynamo-tokens",
"flume",
......@@ -1534,6 +1535,7 @@ dependencies = [
"tokio-util",
"tracing",
"uuid",
"validator",
"xxhash-rust",
]
......@@ -3333,7 +3335,7 @@ checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
dependencies = [
"bitflags 2.11.0",
"libc",
"redox_syscall 0.7.2",
"redox_syscall 0.7.3",
]
[[package]]
......@@ -4056,9 +4058,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "objc2"
version = "0.6.3"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c2599ce0ec54857b29ce62166b0ed9b4f6f1a70ccc9a71165b6154caca8c05"
checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f"
dependencies = [
"objc2-encode",
]
......@@ -4653,18 +4655,18 @@ dependencies = [
[[package]]
name = "pin-project"
version = "1.1.10"
version = "1.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.10"
version = "1.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6"
dependencies = [
"proc-macro2",
"quote",
......@@ -4673,9 +4675,9 @@ dependencies = [
[[package]]
name = "pin-project-lite"
version = "0.2.16"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
[[package]]
name = "pin-utils"
......@@ -5349,9 +5351,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.7.2"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d94dd2f7cd932d4dc02cc8b2b50dfd38bd079a4e5d79198b99743d7fcf9a4b4"
checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16"
dependencies = [
"bitflags 2.11.0",
]
......@@ -8079,18 +8081,18 @@ dependencies = [
[[package]]
name = "zerocopy"
version = "0.8.39"
version = "0.8.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a"
checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.39"
version = "0.8.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517"
checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953"
dependencies = [
"proc-macro2",
"quote",
......
......@@ -806,9 +806,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "chrono"
version = "0.4.43"
version = "0.4.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118"
checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
dependencies = [
"iana-time-zone",
"js-sys",
......@@ -1124,9 +1124,9 @@ dependencies = [
[[package]]
name = "cudarc"
version = "0.19.2"
version = "0.19.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aed81f178e780f3d5d354d12b4c5c5a484c4a9c329ecd037ac57f2a0e0648397"
checksum = "6468cb7fa330840f3ebcd8df51edc0e7bf5c18df524792ce6004c6821851cdf3"
dependencies = [
"libloading 0.9.0",
]
......@@ -1288,9 +1288,9 @@ dependencies = [
[[package]]
name = "deranged"
version = "0.5.6"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc3dc5ad92c2e2d1c193bbbbdf2ea477cb81331de4f3103f267ca18368b988c4"
checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
dependencies = [
"powerfmt",
"serde_core",
......@@ -1447,9 +1447,9 @@ dependencies = [
[[package]]
name = "dispatch2"
version = "0.3.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89a09f22a6c6069a18470eb92d2298acf25463f14256d24778e1230d789a2aec"
checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38"
dependencies = [
"bitflags 2.11.0",
"objc2",
......@@ -1529,6 +1529,7 @@ dependencies = [
"async-trait",
"dashmap 6.1.0",
"derive-getters",
"derive_builder",
"dynamo-runtime",
"dynamo-tokens",
"flume",
......@@ -1542,6 +1543,7 @@ dependencies = [
"tokio-util",
"tracing",
"uuid",
"validator",
"xxhash-rust",
]
......@@ -3022,9 +3024,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
[[package]]
name = "jiff"
version = "0.2.20"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c867c356cc096b33f4981825ab281ecba3db0acefe60329f044c1789d94c6543"
checksum = "b3e3d65f018c6ae946ab16e80944b97096ed73c35b221d1c478a6c81d8f57940"
dependencies = [
"jiff-static",
"jiff-tzdb-platform",
......@@ -3037,9 +3039,9 @@ dependencies = [
[[package]]
name = "jiff-static"
version = "0.2.20"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7946b4325269738f270bb55b3c19ab5c5040525f83fd625259422a9d25d9be5"
checksum = "a17c2b211d863c7fde02cbea8a3c1a439b98e109286554f2860bdded7ff83818"
dependencies = [
"proc-macro2",
"quote",
......@@ -3073,9 +3075,9 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.86"
version = "0.3.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d36139f1c97c42c0c86a411910b04e48d4939a0376e6e0f989420cbdee0120e5"
checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
dependencies = [
"once_cell",
"wasm-bindgen",
......@@ -3372,7 +3374,7 @@ checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
dependencies = [
"bitflags 2.11.0",
"libc",
"redox_syscall 0.7.1",
"redox_syscall 0.7.3",
]
[[package]]
......@@ -3383,9 +3385,9 @@ checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
[[package]]
name = "linux-raw-sys"
version = "0.11.0"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
[[package]]
name = "litemap"
......@@ -4104,9 +4106,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "objc2"
version = "0.6.3"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7c2599ce0ec54857b29ce62166b0ed9b4f6f1a70ccc9a71165b6154caca8c05"
checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f"
dependencies = [
"objc2-encode",
]
......@@ -4701,18 +4703,18 @@ dependencies = [
[[package]]
name = "pin-project"
version = "1.1.10"
version = "1.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.10"
version = "1.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6"
dependencies = [
"proc-macro2",
"quote",
......@@ -4721,9 +4723,9 @@ dependencies = [
[[package]]
name = "pin-project-lite"
version = "0.2.16"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
[[package]]
name = "pin-utils"
......@@ -5014,9 +5016,9 @@ dependencies = [
[[package]]
name = "pulldown-cmark"
version = "0.13.0"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0"
checksum = "83c41efbf8f90ac44de7f3a868f0867851d261b56291732d0cbf7cceaaeb55a6"
dependencies = [
"bitflags 2.11.0",
"memchr",
......@@ -5407,9 +5409,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.7.1"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35985aa610addc02e24fc232012c86fd11f14111180f902b67e2d5331f8ebf2b"
checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16"
dependencies = [
"bitflags 2.11.0",
]
......@@ -5470,9 +5472,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
version = "0.8.9"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c"
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
[[package]]
name = "reqwest"
......@@ -5539,9 +5541,9 @@ dependencies = [
[[package]]
name = "rgb"
version = "0.8.52"
version = "0.8.53"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce"
checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4"
[[package]]
name = "ring"
......@@ -5670,22 +5672,22 @@ dependencies = [
[[package]]
name = "rustix"
version = "1.1.3"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34"
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
dependencies = [
"bitflags 2.11.0",
"errno",
"libc",
"linux-raw-sys 0.11.0",
"linux-raw-sys 0.12.1",
"windows-sys 0.61.2",
]
[[package]]
name = "rustls"
version = "0.23.36"
version = "0.23.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b"
checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
dependencies = [
"aws-lc-rs",
"log",
......@@ -6091,9 +6093,9 @@ dependencies = [
[[package]]
name = "serde_with"
version = "3.16.1"
version = "3.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7"
checksum = "381b283ce7bc6b476d903296fb59d0d36633652b633b27f64db4fb46dcbfc3b9"
dependencies = [
"base64 0.22.1",
"chrono",
......@@ -6110,9 +6112,9 @@ dependencies = [
[[package]]
name = "serde_with_macros"
version = "3.16.1"
version = "3.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c"
checksum = "a6d4e30573c8cb306ed6ab1dca8423eec9a463ea0e155f45399455e0368b27e0"
dependencies = [
"darling 0.21.3",
"proc-macro2",
......@@ -6421,14 +6423,14 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
[[package]]
name = "tempfile"
version = "3.25.0"
version = "3.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1"
checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0"
dependencies = [
"fastrand",
"getrandom 0.4.1",
"once_cell",
"rustix 1.1.3",
"rustix 1.1.4",
"windows-sys 0.61.2",
]
......@@ -7495,9 +7497,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen"
version = "0.2.109"
version = "0.2.113"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ff9c7baef35ac3c0e17d8bfc9ad75eb62f85a2f02bccc906699dadb0aa9c622"
checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2"
dependencies = [
"cfg-if 1.0.4",
"once_cell",
......@@ -7508,9 +7510,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.59"
version = "0.4.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24699cd39db9966cf6e2ef10d2f72779c961ad905911f395ea201c3ec9f545d"
checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a"
dependencies = [
"cfg-if 1.0.4",
"futures-util",
......@@ -7522,9 +7524,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.109"
version = "0.2.113"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39455e84ad887a0bbc93c116d72403f1bb0a39e37dd6f235a43e2128a0c7f1fd"
checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
......@@ -7532,9 +7534,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.109"
version = "0.2.113"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dff4761f60b0b51fd13fec8764167b7bbcc34498ce3e52805fe1db6f2d56b6d6"
checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60"
dependencies = [
"bumpalo",
"proc-macro2",
......@@ -7545,9 +7547,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.109"
version = "0.2.113"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc6a171c53d98021a93a474c4a4579d76ba97f9517d871bc12e27640f218b6dd"
checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5"
dependencies = [
"unicode-ident",
]
......@@ -7601,9 +7603,9 @@ dependencies = [
[[package]]
name = "web-sys"
version = "0.3.86"
version = "0.3.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "668fa5d00434e890a452ab060d24e3904d1be93f7bb01b70e5603baa2b8ab23b"
checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97"
dependencies = [
"js-sys",
"wasm-bindgen",
......@@ -8154,18 +8156,18 @@ dependencies = [
[[package]]
name = "zerocopy"
version = "0.8.39"
version = "0.8.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a"
checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.39"
version = "0.8.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517"
checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953"
dependencies = [
"proc-macro2",
"quote",
......
......@@ -24,6 +24,7 @@ dynamo-tokens = { workspace = true }
anyhow = { workspace = true }
async-trait = { workspace = true }
dashmap = { workspace = true }
derive_builder = { workspace = true }
derive-getters = { workspace = true }
prometheus = { workspace = true }
rand = { workspace = true }
......@@ -34,6 +35,7 @@ tokio = { workspace = true }
tokio-util = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
validator = { workspace = true }
xxhash-rust = { workspace = true }
# dependencies
......@@ -71,3 +73,8 @@ required-features = ["bench"]
name = "mooncake_bench"
harness = false
required-features = ["bench"]
[[bench]]
name = "active_sequences_bench"
harness = false
required-features = ["bench"]
This diff is collapsed.
This diff is collapsed.
......@@ -13,11 +13,14 @@
//! cargo bench --package dynamo-kv-router --bench kv_indexer_bench --features bench -- microbench --help
//! cargo bench --package dynamo-kv-router --bench kv_indexer_bench --features bench -- stress --help
#[path = "common/mod.rs"]
mod common;
use common::{SequenceData, generate_sequences};
use clap::{Args, Parser, Subcommand, ValueEnum};
use dynamo_bench::common::LatencyStats;
use dynamo_kv_router::{
ConcurrentRadixTree,
bench_utils::{SequenceData, generate_sequences},
indexer::{
KvIndexer, KvIndexerInterface, KvIndexerMetrics, KvIndexerSharded, ThreadPoolIndexer,
},
......@@ -1280,7 +1283,7 @@ async fn run_stress_mode(args: StressArgs) {
// Test single indexer
if matches!(args.indexer_type, IndexerType::Single | IndexerType::All) {
let token = CancellationToken::new();
let mut indexer = KvIndexer::new(token.clone(), args.common.block_size, metrics.clone());
let indexer = KvIndexer::new(token.clone(), args.common.block_size, metrics.clone());
println!(
"\n Applying {} store events to KvIndexer...",
......@@ -1290,7 +1293,7 @@ async fn run_stress_mode(args: StressArgs) {
for (event_id, seq) in sequences.iter().enumerate() {
let event = seq.to_store_event(event_id as u64);
KvIndexerInterface::apply_event(&mut indexer, event).await;
KvIndexerInterface::apply_event(&indexer, event).await;
if args.common.verbose && (event_id + 1) % 100 == 0 {
println!(" Applied {}/{} events...", event_id + 1, sequences.len());
......@@ -1322,7 +1325,7 @@ async fn run_stress_mode(args: StressArgs) {
// Test sharded indexer
if matches!(args.indexer_type, IndexerType::Sharded | IndexerType::All) {
let token = CancellationToken::new();
let mut indexer = KvIndexerSharded::new(
let indexer = KvIndexerSharded::new(
token.clone(),
args.num_shards,
args.common.block_size,
......@@ -1337,7 +1340,7 @@ async fn run_stress_mode(args: StressArgs) {
for (event_id, seq) in sequences.iter().enumerate() {
let event = seq.to_store_event(event_id as u64);
KvIndexerInterface::apply_event(&mut indexer, event).await;
KvIndexerInterface::apply_event(&indexer, event).await;
if args.common.verbose && (event_id + 1) % 100 == 0 {
println!(" Applied {}/{} events...", event_id + 1, sequences.len());
......
This diff is collapsed.
......@@ -13,13 +13,15 @@
//!
//! Run with: cargo bench --package dynamo-kv-router --bench radix_tree_microbench --features bench -- --help
#[path = "common/mod.rs"]
mod common;
use common::{SequenceData, generate_sequences};
use clap::{Parser, ValueEnum};
use dynamo_bench::common::LatencyStats;
use dynamo_kv_router::{
ConcurrentRadixTree, OverlapScores, PositionalIndexer, RadixTree, RouterEvent, SyncIndexer,
bench_utils::{SequenceData, generate_sequences},
compute_block_hash_for_seq,
protocols::LocalBlockHash,
compute_block_hash_for_seq, protocols::LocalBlockHash,
};
use std::time::{Duration, Instant};
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
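//! Benchmark utilities for kv-router benchmarks.
//!
//! This module provides shared helpers for benchmarking:
//! - `SequenceData`: Pre-generated sequence data for benchmarking
//! - `generate_sequences`: Builds a set of `SequenceData` with optional shared prefixes
//! - `median`: Median of a slice of durations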
use crate::protocols::{
ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData, KvCacheRemoveData, KvCacheStoreData,
KvCacheStoredBlockData, LocalBlockHash, RouterEvent, WorkerId, compute_seq_hash_for_block,
};
use rand::{Rng, SeedableRng, rngs::StdRng};
use std::time::Duration;
/// Pre-generated sequence data for benchmarking.
///
/// `local_hashes` and `external_hashes` are paired by position when a store
/// event is built, so index `i` in both vectors describes the same block.
#[derive(Clone)]
pub struct SequenceData {
    /// Worker the events built from this sequence are attributed to.
    pub worker_id: WorkerId,
    /// Per-block hashes emitted as `tokens_hash` in store events.
    pub local_hashes: Vec<LocalBlockHash>,
    /// Per-block hashes emitted as `block_hash` in store events and as the
    /// `block_hashes` list in remove events.
    pub external_hashes: Vec<ExternalSequenceBlockHash>,
}
impl SequenceData {
    /// Build a synthetic sequence of `depth` blocks for `worker_id`.
    ///
    /// Block `i` uses the raw value `(seq_id << 32) | i` for both its local
    /// and its external hash.
    pub fn new(seq_id: u64, worker_id: WorkerId, depth: usize) -> Self {
        // Single source of truth for the per-block raw hash value.
        let raw_hash = |block_idx: usize| (seq_id << 32) | (block_idx as u64);

        let mut local_hashes: Vec<LocalBlockHash> = Vec::with_capacity(depth);
        let mut external_hashes: Vec<ExternalSequenceBlockHash> = Vec::with_capacity(depth);
        for block_idx in 0..depth {
            let value = raw_hash(block_idx);
            local_hashes.push(LocalBlockHash(value));
            external_hashes.push(ExternalSequenceBlockHash(value));
        }

        Self {
            worker_id,
            local_hashes,
            external_hashes,
        }
    }

    /// Build a sequence from caller-supplied local hashes.
    ///
    /// The external hashes are derived by running `compute_seq_hash_for_block`
    /// over `local_hashes` and wrapping each result, i.e. they are cumulative
    /// sequence hashes rather than copies of the local ones.
    ///
    /// This ensures FlatHashMap can correctly identify block positions.
    pub fn from_local_hashes(worker_id: WorkerId, local_hashes: Vec<LocalBlockHash>) -> Self {
        let external_hashes: Vec<ExternalSequenceBlockHash> =
            compute_seq_hash_for_block(&local_hashes)
                .into_iter()
                .map(ExternalSequenceBlockHash)
                .collect();

        Self {
            worker_id,
            local_hashes,
            external_hashes,
        }
    }

    /// Build a `Stored` event carrying every block of this sequence.
    ///
    /// The event has no parent hash, no multimodal extra info, and `dp_rank` 0.
    pub fn to_store_event(&self, event_id: u64) -> RouterEvent {
        // Pair each local hash with the external hash at the same position.
        let blocks = self
            .local_hashes
            .iter()
            .zip(&self.external_hashes)
            .map(|(&tokens_hash, &block_hash)| KvCacheStoredBlockData {
                tokens_hash,
                block_hash,
                mm_extra_info: None,
            })
            .collect();

        let data = KvCacheEventData::Stored(KvCacheStoreData {
            parent_hash: None,
            blocks,
        });

        RouterEvent {
            worker_id: self.worker_id,
            event: KvCacheEvent {
                event_id,
                data,
                dp_rank: 0,
            },
        }
    }

    /// Build a `Removed` event listing every external hash of this sequence.
    ///
    /// The event uses `dp_rank` 0.
    pub fn to_remove_event(&self, event_id: u64) -> RouterEvent {
        let data = KvCacheEventData::Removed(KvCacheRemoveData {
            block_hashes: self.external_hashes.clone(),
        });

        RouterEvent {
            worker_id: self.worker_id,
            event: KvCacheEvent {
                event_id,
                data,
                dp_rank: 0,
            },
        }
    }
}
/// Generate sequences with shared prefix prompts.
///
/// Sequence `i` is assigned to worker `i % num_workers`. When prefix sharing is
/// active (`num_prefix_groups > 0` and the rounded prefix length is non-zero),
/// each sequence draws a group id from the seeded RNG and its first
/// `prefix_length` blocks use hashes that depend only on the group id and the
/// block index; the remaining blocks use hashes unique to the sequence.
///
/// # Arguments
/// * `num_sequences` - Number of sequences to generate
/// * `depth` - Number of blocks per sequence
/// * `num_workers` - Number of workers to distribute sequences across
/// * `prefix_ratio` - Ratio of blocks that share a prefix (0.0 to 1.0);
///   the prefix length is `depth * prefix_ratio` rounded to the nearest block
/// * `num_prefix_groups` - Number of distinct prefix groups
/// * `seed` - Random seed for reproducibility
/// * `use_cumulative_hash` - If true, use `from_local_hashes` for proper cumulative hashes;
///   otherwise the external hashes mirror the local hash values
///
/// # Panics
/// Panics if `num_workers` is zero while `num_sequences` is non-zero.
pub fn generate_sequences(
    num_sequences: usize,
    depth: usize,
    num_workers: usize,
    prefix_ratio: f64,
    num_prefix_groups: usize,
    seed: u64,
    use_cumulative_hash: bool,
) -> Vec<SequenceData> {
    /// Tag occupying the upper 32 bits of every shared-prefix hash.
    const SHARED_PREFIX_TAG: u64 = 0xDEAD_BEEF_0000_0000;

    // Fail with an actionable message instead of the bare remainder-by-zero
    // panic that `seq_id % num_workers` would raise. An empty request never
    // reaches the modulo, so it is still allowed with zero workers.
    assert!(
        num_sequences == 0 || num_workers > 0,
        "generate_sequences: num_workers must be non-zero when num_sequences > 0"
    );

    let prefix_length = (depth as f64 * prefix_ratio).round() as usize;
    let mut rng: StdRng = StdRng::seed_from_u64(seed);
    let mut sequences = Vec::with_capacity(num_sequences);

    for seq_id in 0..num_sequences {
        let seq_id_u64 = seq_id as u64;
        let worker_id = (seq_id % num_workers) as WorkerId;

        // Determine prefix group for this sequence (one RNG draw per sequence,
        // and only when prefix sharing is active).
        let group_id = if num_prefix_groups > 0 && prefix_length > 0 {
            Some(rng.random_range(0..num_prefix_groups) as u64)
        } else {
            None
        };

        // Raw hash value for one block: shared prefix (if applicable) or a
        // value unique to this sequence.
        let raw_hash = |block_idx: usize| -> u64 {
            let block_idx_u64 = block_idx as u64;
            match group_id {
                // The group id is XOR-ed into the tag rather than OR-ed: OR
                // would swallow every group-id bit that overlaps a set bit of
                // the tag, making distinct groups (e.g. ids 0..=15) produce
                // identical prefixes. XOR with a constant is injective, and
                // group 0 keeps the plain tag value.
                Some(gid) if block_idx < prefix_length => {
                    (SHARED_PREFIX_TAG ^ (gid << 32)) | block_idx_u64
                }
                // Unique suffix (or no shared prefix)
                _ => (seq_id_u64 << 32) | block_idx_u64,
            }
        };

        let local_hashes: Vec<LocalBlockHash> = (0..depth)
            .map(|block_idx| LocalBlockHash(raw_hash(block_idx)))
            .collect();

        let sequence = if use_cumulative_hash {
            SequenceData::from_local_hashes(worker_id, local_hashes)
        } else {
            // Without cumulative hashing the external hashes carry the same
            // raw values as the local ones.
            let external_hashes: Vec<ExternalSequenceBlockHash> = (0..depth)
                .map(|block_idx| ExternalSequenceBlockHash(raw_hash(block_idx)))
                .collect();
            SequenceData {
                worker_id,
                local_hashes,
                external_hashes,
            }
        };
        sequences.push(sequence);
    }

    sequences
}
/// Compute the median of a set of durations.
///
/// Returns [`Duration::ZERO`] for an empty slice. For an odd number of samples the middle
/// element of the sorted data is returned; for an even number the two middle elements are
/// averaged. The input slice is not modified.
pub fn median(durations: &[Duration]) -> Duration {
    if durations.is_empty() {
        return Duration::ZERO;
    }
    let mut sorted = durations.to_vec();
    // Equal durations are indistinguishable, so stability is not needed.
    sorted.sort_unstable();

    let mid = sorted.len() / 2;
    let upper = sorted[mid];
    if sorted.len() % 2 == 1 {
        return upper;
    }
    let lower = sorted[mid - 1];
    // `lower <= upper` after sorting, so this midpoint form can neither overflow nor underflow.
    lower + (upper - lower) / 2
}
......@@ -7,34 +7,43 @@
//! efficient KV cache lookup and routing in distributed LLM inference systems.
pub mod approx;
#[cfg(feature = "bench")]
pub mod bench_utils;
pub mod concurrent_radix_tree;
pub mod indexer;
pub mod multi_worker_sequence;
#[cfg(feature = "bench")]
pub mod naive_indexers;
pub mod nested_map;
pub mod protocols;
pub mod radix_tree;
pub mod sequence;
pub mod scheduling;
pub mod sequences;
// Backward-compat re-exports: preserve old module paths for external consumers
pub use scheduling::config;
pub use scheduling::queue;
pub use scheduling::selector;
pub use sequences::multi_worker as multi_worker_sequence;
pub use sequences::single as sequence;
#[cfg(test)]
pub(crate) mod test_utils;
#[cfg(any(test, feature = "bench"))]
pub mod test_utils;
// Re-export key types for convenience
pub use concurrent_radix_tree::ConcurrentRadixTree;
pub use indexer::{MaybeError, SyncIndexer, ThreadPoolIndexer};
pub use multi_worker_sequence::{
pub use self::multi_worker_sequence::{
ActiveSequencesMultiWorker, SequenceError, SequencePublisher, SequenceRequest,
SequenceSubscriber,
};
pub use self::sequence::{ActiveSequences, RequestId};
pub use concurrent_radix_tree::ConcurrentRadixTree;
pub use config::{KvRouterConfig, RouterConfigOverride};
pub use indexer::{MaybeError, SyncIndexer, ThreadPoolIndexer};
#[cfg(feature = "bench")]
pub use naive_indexers::{InvertedIndex, NaiveNestedMap};
pub use nested_map::PositionalIndexer;
pub use protocols::{
KvCacheEventError, LocalBlockHash, OverlapScores, RouterEvent, WorkerId,
KvCacheEventError, LocalBlockHash, OverlapScores, RouterEvent, WorkerConfigLike, WorkerId,
compute_block_hash_for_seq,
};
pub use queue::SchedulerQueue;
pub use radix_tree::RadixTree;
pub use sequence::{ActiveSequences, RequestId};
pub use scheduling::{KvSchedulerError, PotentialLoad, SchedulingRequest, SchedulingResponse};
pub use selector::{DefaultWorkerSelector, WorkerSelector};
......@@ -92,6 +92,15 @@ pub fn compute_seq_hash_for_block(block_hashes: &[LocalBlockHash]) -> Vec<Sequen
sequence_hashes
}
/// Trait abstracting the worker configuration fields needed by the scheduling layer.
///
/// `ModelRuntimeConfig` (in `lib/llm`) implements this directly so no adapter type is needed.
pub trait WorkerConfigLike {
/// Number of data-parallel ranks of the worker. The scheduling code visits ranks
/// `0..data_parallel_size()` for each worker, so a value of 0 yields no routable ranks.
fn data_parallel_size(&self) -> u32;
/// Maximum number of batched tokens, or `None` when not configured (the queue then falls
/// back to its own default).
fn max_num_batched_tokens(&self) -> Option<u64>;
/// Total number of KV blocks, if available. Within the scheduling code shown here it is
/// only used to enrich a log line.
fn total_kv_blocks(&self) -> Option<u64>;
}
/// A worker identifier.
///
/// Used as the key of worker-configuration maps and combined with a data-parallel rank to
/// form a `WorkerWithDpRank`.
pub type WorkerId = u64;
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use derive_builder::Builder;
use rand::Rng;
use serde::{Deserialize, Serialize};
use validator::{Validate, ValidationError};
use crate::protocols::{compute_block_hash_for_seq, compute_seq_hash_for_block};
/// Override configuration for router settings that can be specified per-request
///
/// Every field is optional; `None` means "use the router-level `KvRouterConfig` value".
#[derive(Debug, Clone, Default, Builder, Serialize, Deserialize, Validate)]
pub struct RouterConfigOverride {
/// Per-request replacement for `KvRouterConfig::overlap_score_weight`.
#[builder(default)]
pub overlap_score_weight: Option<f64>,
/// Per-request replacement for `KvRouterConfig::router_temperature`; validated to be >= 0.
#[builder(default)]
#[validate(range(min = 0.0))]
pub router_temperature: Option<f64>,
/// Per-request replacement for `KvRouterConfig::router_assume_kv_reuse`.
#[builder(default)]
pub assume_kv_reuse: Option<bool>,
}
/// KV Router configuration parameters
///
/// Field ranges are checked by the `validator` attributes below; cross-field rules are
/// enforced by `validate_kv_router_config`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Validate)]
#[validate(schema(function = "validate_kv_router_config"))]
pub struct KvRouterConfig {
/// Weight applied to the potential-prefill-blocks term of the worker selection cost.
/// A value of 0 is treated as "overlap scoring disabled" by
/// `should_subscribe_to_kv_events`.
#[validate(range(min = 0.0))]
pub overlap_score_weight: f64,
/// Temperature for softmax sampling over worker costs. At 0 no sampling happens and the
/// lowest-cost worker(s) are chosen.
#[validate(range(min = 0.0))]
pub router_temperature: f64,
/// Whether KV events are used. When false, the TTL / tree-size / prune settings below apply.
pub use_kv_events: bool,
/// **Deprecated:** Enable durable KV events using NATS JetStream instead of the default event plane.
/// This option will be removed in a future release. The event-plane subscriber
/// (local_indexer mode) is now the recommended path.
pub durable_kv_events: bool,
/// NOTE(review): presumably enables state synchronization between router replicas —
/// not visible from this file, confirm before relying on it.
pub router_replica_sync: bool,
/// Whether to track active blocks in the router (default: true)
pub router_track_active_blocks: bool,
/// Whether to track output blocks during generation (default: false)
/// When enabled, the router adds placeholder blocks as tokens are generated
/// and applies fractional decay based on progress toward agent_hints.osl.
pub router_track_output_blocks: bool,
/// Whether to assume KV cache reuse when tracking active blocks (default: true).
/// When true, computes actual block hashes for sequence tracking.
/// When false, generates random hashes (assuming no KV cache reuse).
pub router_assume_kv_reuse: bool,
/// Threshold for triggering snapshots. If None, no snapshots will be performed.
#[validate(range(min = 1))]
pub router_snapshot_threshold: Option<u32>,
/// Whether to reset the router state on startup (default: false)
pub router_reset_states: bool,
/// TTL for blocks in seconds (only used when use_kv_events is false, default: 120.0)
#[validate(range(min = 0.0))]
pub router_ttl_secs: f64,
/// Maximum tree size before pruning (only used when use_kv_events is false, default: 2^20 = 1048576)
#[validate(range(min = 1))]
pub router_max_tree_size: usize,
/// Target size ratio after pruning (only used when use_kv_events is false, default: 0.8)
#[validate(range(min = 0.0, max = 1.0))]
pub router_prune_target_ratio: f64,
/// Queue threshold fraction for prefill token capacity.
/// When set, requests are queued if all workers exceed this fraction of max_num_batched_tokens.
/// If None (default), queueing is disabled and all requests go directly to ready.
/// Must be >= 0 (the validator accepts 0.0, which queues as soon as every worker has any
/// active tokens).
#[validate(range(min = 0.0))]
pub router_queue_threshold: Option<f64>,
/// Number of event processing threads for the KV indexer.
/// When > 1, uses ConcurrentRadixTree with a thread pool instead of the
/// single-threaded RadixTree. Default: 4.
#[validate(range(min = 1))]
pub router_event_threads: u32,
/// Enable cache control (PIN with TTL) via the worker's cache_control service mesh endpoint.
/// When true, the router creates a cache_control client and honors nvext.cache_control on
/// requests, firing a pin_prefix call (with TTL) to the worker after generation completes.
/// When false (default), cache_control is ignored and no cache_control client is created.
pub router_enable_cache_control: bool,
}
impl Default for KvRouterConfig {
    /// Baseline router configuration: KV events on (non-durable), active-block tracking on,
    /// output-block tracking off, queueing disabled and four indexer threads.
    fn default() -> Self {
        // 2^20 = 1048576, matches PruneConfig::default()
        const DEFAULT_MAX_TREE_SIZE: usize = 1 << 20;
        const DEFAULT_SNAPSHOT_THRESHOLD: u32 = 1_000_000;

        Self {
            // Worker selection cost.
            overlap_score_weight: 1.0,
            router_temperature: 0.0,

            // Event plane; non-durable means NATS Core (local indexer mode).
            use_kv_events: true,
            durable_kv_events: false,
            router_event_threads: 4,
            router_replica_sync: false,

            // Block tracking.
            router_track_active_blocks: true,
            router_track_output_blocks: false,
            router_assume_kv_reuse: true,

            // Persistence and startup.
            router_snapshot_threshold: Some(DEFAULT_SNAPSHOT_THRESHOLD),
            router_reset_states: false,

            // TTL-based tree maintenance.
            router_ttl_secs: 120.0,
            router_max_tree_size: DEFAULT_MAX_TREE_SIZE,
            router_prune_target_ratio: 0.8,

            // Queueing and cache control.
            router_queue_threshold: None,
            router_enable_cache_control: false,
        }
    }
}
/// Struct-level validation for [`KvRouterConfig`].
///
/// Logs a deprecation warning whenever `durable_kv_events` is set, then rejects two
/// inconsistent combinations: durable events without KV events, and output-block tracking
/// without active-block tracking.
fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationError> {
    if config.durable_kv_events {
        tracing::warn!(
            "--durable-kv-events is deprecated and will be removed in a future release. \
             The event-plane subscriber (local_indexer mode) is now the recommended path."
        );
        if !config.use_kv_events {
            return Err(ValidationError::new(
                "durable_kv_events requires use_kv_events=true",
            ));
        }
    }

    let output_tracking_without_active =
        config.router_track_output_blocks && !config.router_track_active_blocks;
    if output_tracking_without_active {
        Err(ValidationError::new(
            "router_track_output_blocks requires router_track_active_blocks=true",
        ))
    } else {
        Ok(())
    }
}
impl KvRouterConfig {
    /// Produce the sequence hashes used for active-block tracking.
    ///
    /// Outcomes:
    /// - `None` when `router_track_active_blocks` is off
    /// - an empty vector when `tokens` does not fill a single block
    /// - real sequence hashes when KV reuse is assumed
    /// - one random hash per full block otherwise
    ///
    /// A per-request `assume_kv_reuse` in `config_override` takes precedence over
    /// `router_assume_kv_reuse`.
    pub fn compute_seq_hashes_for_tracking(
        &self,
        tokens: &[u32],
        block_size: u32,
        config_override: Option<&RouterConfigOverride>,
        lora_name: Option<&str>,
    ) -> Option<Vec<u64>> {
        if !self.router_track_active_blocks {
            return None;
        }

        let full_blocks = tokens.len() / block_size as usize;
        if full_blocks == 0 {
            return Some(Vec::new());
        }

        let reuse_assumed = match config_override.and_then(|cfg| cfg.assume_kv_reuse) {
            Some(flag) => flag,
            None => self.router_assume_kv_reuse,
        };

        let hashes: Vec<u64> = if reuse_assumed {
            let per_block = compute_block_hash_for_seq(tokens, block_size, None, lora_name);
            compute_seq_hash_for_block(&per_block)
        } else {
            let mut rng = rand::rng();
            std::iter::repeat_with(|| rng.random::<u64>())
                .take(full_blocks)
                .collect()
        };
        Some(hashes)
    }

    /// Decide whether the KV event subscription should be started.
    ///
    /// It is skipped when KV events are disabled (`use_kv_events=false`) or when overlap
    /// scoring is disabled (`overlap_score_weight=0`); skipping avoids querying workers for
    /// their local indexer state.
    pub fn should_subscribe_to_kv_events(&self) -> bool {
        if !self.use_kv_events {
            return false;
        }
        self.overlap_score_weight > 0.0
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub mod config;
pub mod queue;
pub mod selector;
mod types;
pub use types::*;
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap};
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::Mutex;
use tokio::sync::watch;
use super::selector::WorkerSelector;
use super::types::{SchedulingRequest, SchedulingResponse};
use crate::protocols::{WorkerConfigLike, WorkerId, WorkerWithDpRank};
use crate::sequences::{ActiveSequencesMultiWorker, SequencePublisher, SequenceRequest};
/// Large default for max_num_batched_tokens when not configured (effectively disables queueing for that worker)
///
/// The busy check compares a worker's active tokens against `threshold * max_num_batched_tokens`,
/// so a very large limit keeps such a worker below the threshold in practice.
pub const DEFAULT_MAX_BATCHED_TOKENS: u64 = 10_000_000;
/// A parked scheduling request together with its queue priority.
///
/// Priority is the effective arrival offset: elapsed time since queue start minus the
/// request's `priority_jump`. A smaller offset means higher priority. Equality and ordering
/// look only at `effective_offset`, never at the request itself.
struct QueueEntry {
    effective_offset: Duration,
    request: SchedulingRequest,
}

impl PartialEq for QueueEntry {
    fn eq(&self, rhs: &Self) -> bool {
        self.effective_offset.eq(&rhs.effective_offset)
    }
}

impl Eq for QueueEntry {}

impl PartialOrd for QueueEntry {
    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, rhs))
    }
}

impl Ord for QueueEntry {
    fn cmp(&self, rhs: &Self) -> Ordering {
        // `BinaryHeap` pops its greatest element, so invert the natural ordering to make the
        // smallest effective offset come out first.
        self.effective_offset.cmp(&rhs.effective_offset).reverse()
    }
}
/// Queue that gates scheduling requests behind a capacity check.
/// When all workers exceed `threshold_frac` utilisation the request is parked in `pending`.
/// When capacity frees up (`update()`), pending requests are scheduled in priority order.
/// If queueing is disabled (threshold_frac is None), requests are scheduled immediately.
pub struct SchedulerQueue<P: SequencePublisher, C: WorkerConfigLike> {
/// Parked requests; the entry with the smallest effective offset is popped first.
pending: Mutex<BinaryHeap<QueueEntry>>,
/// Shared per-worker sequence tracker: source of active-token counts and target of
/// `add_request` bookings.
slots: Arc<ActiveSequencesMultiWorker<P>>,
/// Watch channel holding the current worker-id -> config map.
workers_with_configs: watch::Receiver<HashMap<WorkerId, C>>,
/// Cached threshold fraction; None means queueing is disabled.
threshold_frac: Option<f64>,
/// Reference instant for computing arrival offsets.
start_time: Instant,
/// Block size forwarded to the worker selector.
block_size: u32,
/// Strategy used to choose a worker for each request.
selector: Box<dyn WorkerSelector<C> + Send + Sync>,
}
impl<P: SequencePublisher + 'static, C: WorkerConfigLike> SchedulerQueue<P, C> {
    /// Create a queue over a shared slot tracker and a watch channel of worker configs.
    ///
    /// `threshold_frac = None` disables queueing (every request is scheduled immediately);
    /// `Some(frac)` enables it and is logged once here. The creation instant becomes the
    /// reference point for arrival offsets.
    pub fn new(
        slots: Arc<ActiveSequencesMultiWorker<P>>,
        workers_with_configs: watch::Receiver<HashMap<WorkerId, C>>,
        threshold_frac: Option<f64>,
        block_size: u32,
        selector: Box<dyn WorkerSelector<C> + Send + Sync>,
    ) -> Self {
        if let Some(frac) = threshold_frac {
            tracing::info!("Router queue enabled with threshold fraction {frac}");
        }
        Self {
            pending: Mutex::new(BinaryHeap::new()),
            slots,
            workers_with_configs,
            threshold_frac,
            start_time: Instant::now(),
            block_size,
            selector,
        }
    }

    /// Build a QueueEntry for a request, computing its effective arrival offset.
    ///
    /// The offset is the time elapsed since queue creation minus the request's
    /// `priority_jump` (in seconds), saturating at zero. Negative and NaN jumps count as zero.
    fn make_entry(&self, request: SchedulingRequest) -> QueueEntry {
        let arrival_offset = self.start_time.elapsed();
        // `Duration::from_secs_f64` panics on infinite or out-of-range input. Treat such a
        // jump as "as large as possible" instead; after the saturating subtraction below the
        // request simply lands at the front of the queue.
        let jump = Duration::try_from_secs_f64(request.priority_jump.max(0.0))
            .unwrap_or(Duration::MAX);
        let effective_offset = arrival_offset.saturating_sub(jump);
        QueueEntry {
            effective_offset,
            request,
        }
    }

    /// Enqueue a new request.
    /// If queueing is disabled or workers have capacity, schedule immediately.
    /// Otherwise park in the pending heap.
    pub async fn enqueue(&self, request: SchedulingRequest) {
        let Some(threshold) = self.threshold_frac else {
            self.schedule(request).await;
            return;
        };

        if self.all_workers_busy(threshold) {
            tracing::debug!("all workers busy, queueing request");
            let entry = self.make_entry(request);
            self.pending.lock().await.push(entry);
        } else {
            self.schedule(request).await;
        }
    }

    /// Called on prefill_complete/free. Drains pending requests while workers have capacity.
    /// Each scheduled request updates active_tokens via add_request, so the busy check
    /// sees fresh state on the next iteration.
    ///
    /// Does nothing when queueing is disabled.
    pub async fn update(&self) {
        let Some(threshold) = self.threshold_frac else {
            return;
        };

        loop {
            if self.all_workers_busy(threshold) {
                break;
            }
            let Some(entry) = self.pending.lock().await.pop() else {
                break;
            };
            tracing::debug!("scheduling request from pending queue");
            self.schedule(entry.request).await;
        }
    }

    /// Run the full scheduling pipeline for a single request:
    /// compute potential load -> select worker -> respond -> book via add_request.
    ///
    /// The caller is answered before the booking happens. Booking is skipped when
    /// `update_states` is false or when the request has no id (the latter is logged as an
    /// error). A failed booking is only logged.
    async fn schedule(&self, mut request: SchedulingRequest) {
        let (decode_blocks, prefill_tokens) = self.slots.potential_blocks_and_tokens(
            request.token_seq.as_deref(),
            request.isl_tokens,
            request.overlaps.clone(),
        );
        request.decode_blocks = decode_blocks;
        request.prefill_tokens = prefill_tokens;

        // Scope the watch borrow so it is released before any `.await` below.
        let selection = {
            let workers = self.workers_with_configs.borrow();
            self.selector
                .select_worker(&workers, &request, self.block_size)
        };

        let selection = match selection {
            Ok(s) => s,
            Err(e) => {
                tracing::warn!("scheduling failed: {e}");
                request.respond(Err(e));
                return;
            }
        };

        request.respond(Ok(SchedulingResponse {
            best_worker: selection.worker,
            overlap_blocks: selection.overlap_blocks,
        }));

        if !request.update_states {
            return;
        }

        let Some(request_id) = request.maybe_request_id else {
            tracing::error!("No request_id provided to add_request to the slot tracker");
            return;
        };

        if let Err(e) = self
            .slots
            .add_request(SequenceRequest {
                request_id: request_id.clone(),
                token_sequence: request.token_seq,
                isl: request.isl_tokens,
                overlap: selection.overlap_blocks,
                expected_output_tokens: None,
                worker: selection.worker,
                // `request` is owned and not used afterwards, so move instead of cloning.
                lora_name: request.lora_name,
            })
            .await
        {
            tracing::warn!("Failed to add request {request_id}: {e}");
        }
    }

    /// Check if all workers are busy based on threshold.
    /// Returns true only if ALL workers exceed the threshold (no worker has capacity).
    ///
    /// A (worker, dp_rank) pair has capacity when its active tokens are at most
    /// `threshold * max_num_batched_tokens`; ranks without a recorded count are treated as 0
    /// tokens. Note that with no registered workers (or only workers with zero DP ranks) the
    /// loops never find capacity and this returns true.
    fn all_workers_busy(&self, threshold: f64) -> bool {
        let active_tokens = self.slots.active_tokens();
        let configs = self.workers_with_configs.borrow();

        for (&worker_id, config) in configs.iter() {
            let dp_size = config.data_parallel_size();
            let max_batched = config
                .max_num_batched_tokens()
                .unwrap_or(DEFAULT_MAX_BATCHED_TOKENS);

            for dp_rank in 0..dp_size {
                let worker = WorkerWithDpRank::new(worker_id, dp_rank);
                let tokens = active_tokens.get(&worker).copied().unwrap_or(0);
                if (tokens as f64) <= threshold * (max_batched as f64) {
                    return false;
                }
            }
        }
        true
    }
}
// Tests for `SchedulerQueue`: concurrent scheduling with queueing disabled, draining of
// parked requests under pressure, and the error path when no workers are registered.
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::watch;
use super::*;
use crate::protocols::OverlapScores;
use crate::selector::DefaultWorkerSelector;
use crate::sequences::ActiveSequencesMultiWorker;
use crate::test_utils::{NoopSequencePublisher, SimpleWorkerConfig};
/// Build a queue plus its slot tracker for `num_workers` workers (ids `0..num_workers`),
/// each with a data-parallel size of 1 and `max_num_batched_tokens` set to `isl`.
fn make_queue(
num_workers: usize,
block_size: u32,
isl: usize,
threshold_frac: Option<f64>,
) -> (
Arc<SchedulerQueue<NoopSequencePublisher, SimpleWorkerConfig>>,
Arc<ActiveSequencesMultiWorker<NoopSequencePublisher>>,
) {
let dp_sizes: HashMap<u64, u32> = (0..num_workers as u64).map(|id| (id, 1)).collect();
let slots = Arc::new(ActiveSequencesMultiWorker::new(
NoopSequencePublisher,
block_size as usize,
dp_sizes,
false,
0,
"test",
));
let mut configs: HashMap<u64, SimpleWorkerConfig> = HashMap::new();
for id in 0..num_workers as u64 {
configs.insert(
id,
SimpleWorkerConfig {
max_num_batched_tokens: Some(isl as u64),
..Default::default()
},
);
}
let (cfg_tx, cfg_rx) = watch::channel(configs);
// Deliberately leak the sender so it is never dropped — presumably to keep the watch
// channel open for the whole test.
std::mem::forget(cfg_tx);
let selector = Box::new(DefaultWorkerSelector::default());
let queue = Arc::new(SchedulerQueue::new(
Arc::clone(&slots),
cfg_rx,
threshold_frac,
block_size,
selector,
));
(queue, slots)
}
/// Build a state-updating request with the given id and input length, no token sequence and
/// no overlaps, together with the receiver on which the scheduling response arrives.
fn make_request(
request_id: &str,
isl_tokens: usize,
) -> (
SchedulingRequest,
tokio::sync::oneshot::Receiver<
Result<SchedulingResponse, crate::scheduling::types::KvSchedulerError>,
>,
) {
let (tx, rx) = tokio::sync::oneshot::channel();
let req = SchedulingRequest {
maybe_request_id: Some(request_id.to_string()),
token_seq: None,
isl_tokens,
overlaps: OverlapScores::default(),
decode_blocks: HashMap::new(),
prefill_tokens: HashMap::new(),
router_config_override: None,
update_states: true,
lora_name: None,
priority_jump: 0.0,
allowed_worker_ids: None,
resp_tx: Some(tx),
};
(req, rx)
}
// With queueing disabled, many concurrent tasks each schedule, complete and free a request;
// afterwards no worker may have active tokens left.
#[tokio::test(flavor = "multi_thread")]
async fn test_concurrent_flood() {
let block_size = 16;
let isl = 512;
let num_workers = 4;
let num_tasks = 25;
let (queue, slots) = make_queue(num_workers, block_size, isl, None);
let mut handles = Vec::new();
for i in 0..num_tasks {
let queue = Arc::clone(&queue);
let slots = Arc::clone(&slots);
handles.push(tokio::spawn(async move {
let req_id = format!("req-{i}");
let (req, rx) = make_request(&req_id, isl);
queue.enqueue(req).await;
let resp = rx.await.expect("oneshot dropped");
let resp = resp.expect("scheduling failed");
assert!(resp.best_worker.worker_id < num_workers as u64);
slots.mark_prefill_completed(&req_id).await.unwrap();
slots.free(&req_id).await.unwrap();
queue.update().await;
}));
}
for h in handles {
h.await.expect("task panicked");
}
let active = slots.active_tokens();
for (worker, tokens) in &active {
assert_eq!(
*tokens, 0,
"worker {worker:?} still has {tokens} active tokens"
);
}
}
// With a threshold of 0.0 a worker counts as busy as soon as it has any active tokens, so
// most requests get parked; repeatedly freeing and calling `update()` must eventually
// schedule every one of them.
#[tokio::test(flavor = "multi_thread")]
async fn test_queueing_under_pressure() {
let block_size = 16;
let isl = 512;
let num_workers = 2;
let num_requests = 10;
let (queue, slots) = make_queue(num_workers, block_size, isl, Some(0.0));
let mut receivers = Vec::new();
let mut req_ids = Vec::new();
for i in 0..num_requests {
let req_id = format!("pressure-{i}");
let (req, rx) = make_request(&req_id, isl);
queue.enqueue(req).await;
receivers.push(rx);
req_ids.push(req_id);
}
// Drain pending by cycling mark_prefill_completed + free + update
// on already-scheduled requests until all receivers have a response.
for _ in 0..num_requests {
queue.update().await;
for rid in &req_ids {
// Errors are ignored on purpose: ids that are still parked (or already freed) are
// expected to fail here.
let _ = slots.mark_prefill_completed(rid).await;
let _ = slots.free(rid).await;
}
}
queue.update().await;
let mut ok_count = 0;
for mut rx in receivers {
if let Ok(result) = rx.try_recv() {
result.expect("scheduling returned error");
ok_count += 1;
}
}
assert_eq!(ok_count, num_requests, "not all requests were scheduled");
}
// With no workers and queueing disabled, scheduling must answer with `NoEndpoints`.
#[tokio::test]
async fn test_no_workers_returns_error() {
let (queue, _slots) = make_queue(0, 16, 512, None);
let (req, rx) = make_request("lonely-req", 512);
queue.enqueue(req).await;
let resp = rx.await.expect("oneshot dropped");
assert!(
matches!(
resp,
Err(crate::scheduling::types::KvSchedulerError::NoEndpoints)
),
"expected NoEndpoints, got {resp:?}"
);
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::collections::HashMap;
use rand::Rng;
use super::config::KvRouterConfig;
use super::types::{KvSchedulerError, SchedulingRequest};
use crate::protocols::{WorkerConfigLike, WorkerId, WorkerSelectionResult, WorkerWithDpRank};
/// A trait that users can implement to define custom selection logic.
///
/// Generic over `C` so that the scheduling layer does not depend on a concrete config type.
pub trait WorkerSelector<C: WorkerConfigLike> {
/// Choose a worker (and data-parallel rank) for `request`.
///
/// * `workers` - current worker-id -> config map
/// * `request` - the request, including its overlap scores and potential load maps
/// * `block_size` - KV block size in tokens
///
/// Returns the selection, or a `KvSchedulerError` when no worker can be chosen.
fn select_worker(
&self,
workers: &HashMap<WorkerId, C>,
request: &SchedulingRequest,
block_size: u32,
) -> Result<WorkerSelectionResult, KvSchedulerError>;
}
/// Pick worker(s) from per-worker logits, where a *smaller* logit is better.
///
/// * `temperature == 0.0`: no sampling; every worker tied for the smallest logit is returned
///   (more than one element on a tie).
/// * otherwise: exactly one worker is sampled. Logits are divided by their range, negated,
///   divided by the temperature and pushed through a softmax; if all logits are equal the
///   distribution is uniform.
///
/// # Panics
/// Panics if `logits` is empty.
fn softmax_sample(
    logits: &HashMap<WorkerWithDpRank, f64>,
    temperature: f64,
) -> Vec<WorkerWithDpRank> {
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

    // Zero temperature: deterministic arg-min, keeping ties.
    if temperature == 0.0 {
        let smallest = logits.values().copied().fold(f64::INFINITY, f64::min);
        return logits
            .iter()
            .filter_map(|(worker, &logit)| (logit == smallest).then_some(*worker))
            .collect();
    }

    let (keys, values): (Vec<WorkerWithDpRank>, Vec<f64>) = logits
        .iter()
        .map(|(worker, &logit)| (*worker, logit))
        .unzip();
    let lowest = values.iter().copied().fold(f64::INFINITY, f64::min);
    let highest = values.iter().copied().fold(f64::NEG_INFINITY, f64::max);

    let probabilities: Vec<f64> = if lowest == highest {
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize by the range, negate, scale by temperature; subtracting the peak before
        // `exp` keeps the exponentials numerically stable.
        let range = highest - lowest;
        let scaled: Vec<f64> = values
            .iter()
            .map(|&v| -(v / range) / temperature)
            .collect();
        let peak = scaled.iter().copied().fold(f64::NEG_INFINITY, f64::max);
        let weights: Vec<f64> = scaled.iter().map(|&s| (s - peak).exp()).collect();
        let total: f64 = weights.iter().sum();
        weights.into_iter().map(|w| w / total).collect()
    };

    // Inverse-CDF sampling: first index whose cumulative probability reaches the sample.
    let sample: f64 = rand::rng().random();
    let mut cumulative = 0.0;
    let chosen = probabilities
        .iter()
        .position(|&p| {
            cumulative += p;
            sample <= cumulative
        })
        // Fallback to last key (shouldn't normally reach here)
        .unwrap_or(keys.len() - 1);
    vec![keys[chosen]]
}
/// Default implementation matching the Python _cost_function.
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
pub kv_router_config: KvRouterConfig,
}
impl DefaultWorkerSelector {
pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
Self {
kv_router_config: kv_router_config.unwrap_or_default(),
}
}
}
impl<C: WorkerConfigLike> WorkerSelector<C> for DefaultWorkerSelector {
    /// Select the worker (and data-parallel rank) with the lowest cost for `request`.
    ///
    /// For every eligible `(worker, dp_rank)` the cost is
    /// `overlap_weight * (prefill_tokens / block_size) + decode_blocks`, where missing
    /// `prefill_tokens` default to the full input length and missing `decode_blocks` default
    /// to the floored prefill block count. Costs are passed to `softmax_sample` with the
    /// effective temperature; ties are broken by the smallest tree size, or randomly when the
    /// tree sizes are equal too. Per-request overrides of weight and temperature win over the
    /// selector's own configuration.
    ///
    /// # Errors
    /// Returns `KvSchedulerError::NoEndpoints` when there are no workers, when none of the
    /// workers is in `allowed_worker_ids`, or when the eligible workers expose no
    /// data-parallel ranks.
    ///
    /// # Panics
    /// Panics if `request.isl_tokens` is zero.
    fn select_worker(
        &self,
        workers: &HashMap<WorkerId, C>,
        request: &SchedulingRequest,
        block_size: u32,
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

        let allowed_ids = request.allowed_worker_ids.as_ref();
        if allowed_ids.map_or(workers.is_empty(), |ids| {
            !workers.keys().any(|wid| ids.contains(wid))
        }) {
            return Err(KvSchedulerError::NoEndpoints);
        }

        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;

        let mut worker_logits = HashMap::new();

        let overlap_weight = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.overlap_score_weight)
            .unwrap_or(self.kv_router_config.overlap_score_weight);

        for (worker_id, config) in workers
            .iter()
            .filter(|(wid, _)| allowed_ids.is_none_or(|ids| ids.contains(wid)))
        {
            let data_parallel_size = config.data_parallel_size();

            for dp_rank in 0..data_parallel_size {
                let worker = WorkerWithDpRank::new(*worker_id, dp_rank);

                let overlap = *overlaps.get(&worker).unwrap_or(&0);

                let prefill_token = *prefill_tokens.get(&worker).unwrap_or(&isl);
                let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

                let decode_block = *decode_blocks
                    .get(&worker)
                    .unwrap_or(&(potential_prefill_block.floor() as usize))
                    as f64;

                let logit = overlap_weight * potential_prefill_block + decode_block;

                worker_logits.insert(worker, logit);

                tracing::info!(
                    "Formula for worker_id={} dp_rank={:?} with {overlap} cached blocks: {logit:.3} \
                     = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                     = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}",
                    worker.worker_id,
                    worker.dp_rank
                );
            }
        }

        // Every eligible worker may report zero data-parallel ranks; `softmax_sample` panics
        // on an empty map, so report the absence of routable endpoints instead.
        if worker_logits.is_empty() {
            return Err(KvSchedulerError::NoEndpoints);
        }

        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);

        let candidates = softmax_sample(&worker_logits, temperature);

        let best_worker = if candidates.len() > 1 {
            tracing::info!("Multiple workers tied with same logit, using tree size as tie-breaker");
            let tree_sizes: Vec<(usize, &WorkerWithDpRank)> = candidates
                .iter()
                .map(|w| (request.overlaps.tree_sizes.get(w).copied().unwrap_or(0), w))
                .collect();

            if tree_sizes.iter().all(|(s, _)| *s == tree_sizes[0].0) {
                // Tree sizes are tied as well: pick uniformly at random.
                let idx = rand::rng().random_range(0..candidates.len());
                candidates[idx]
            } else {
                *tree_sizes.iter().min_by_key(|(s, _)| *s).unwrap().1
            }
        } else {
            candidates[0]
        };

        let best_logit = worker_logits[&best_worker];
        let best_overlap = *overlaps.get(&best_worker).unwrap_or(&0);

        let total_blocks_info = workers
            .get(&best_worker.worker_id)
            .and_then(|cfg| cfg.total_kv_blocks())
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

        let tree_size = request
            .overlaps
            .tree_sizes
            .get(&best_worker)
            .copied()
            .unwrap_or(0);

        tracing::info!(
            "Selected worker: worker_id={} dp_rank={:?}, logit: {:.3}, cached blocks: {}, tree size: {}{}",
            best_worker.worker_id,
            best_worker.dp_rank,
            best_logit,
            best_overlap,
            tree_size,
            total_blocks_info
        );

        Ok(WorkerSelectionResult {
            worker: best_worker,
            required_blocks: request_blocks as u64,
            overlap_blocks: best_overlap,
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the worker handle with the given id.
    fn worker(id: u64) -> WorkerWithDpRank {
        WorkerWithDpRank::from_worker_id(id)
    }

    /// Build a logits map from `(worker_id, logit)` pairs.
    fn logits_from(entries: &[(u64, f64)]) -> HashMap<WorkerWithDpRank, f64> {
        entries
            .iter()
            .map(|&(id, logit)| (worker(id), logit))
            .collect()
    }

    /// A single candidate must always be returned, whatever the temperature or logit value.
    #[test]
    fn test_softmax_sample_single_key() {
        let only = worker(42);

        let logits = logits_from(&[(42, 0.5)]);
        for temperature in [0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, temperature);
            assert_eq!(result.len(), 1, "Should return exactly one worker");
            assert_eq!(result[0], only, "Should return the only available worker");
        }

        // Negative, large positive and zero logits behave the same.
        for logit in [-100.0, 100.0, 0.0] {
            let result = softmax_sample(&logits_from(&[(42, logit)]), 1.0);
            assert_eq!(result.len(), 1);
            assert_eq!(result[0], only);
        }
    }

    /// At temperature zero the smallest logit wins and ties are all reported.
    #[test]
    fn test_softmax_sample_zero_temperature() {
        // Unique minimum.
        let distinct = logits_from(&[(1, 5.0), (2, 3.0), (3, 7.0), (4, 3.5)]);
        let result = softmax_sample(&distinct, 0.0);
        assert_eq!(
            result.len(),
            1,
            "Should return one worker when there's no tie"
        );
        assert_eq!(
            result[0],
            worker(2),
            "Should return worker with smallest logit when temperature is 0"
        );

        // Two workers share the minimum.
        let tied = logits_from(&[(1, 5.0), (2, 3.0), (5, 3.0), (6, 7.0)]);
        let result = softmax_sample(&tied, 0.0);
        assert_eq!(
            result.len(),
            2,
            "Should return all workers with smallest logit when tied"
        );
        assert!(
            result.contains(&worker(2)) && result.contains(&worker(5)),
            "Should contain both tied workers"
        );

        // Negative logits.
        let negative = logits_from(&[(10, -1.0), (20, -5.0), (30, 0.0)]);
        let result = softmax_sample(&negative, 0.0);
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0],
            worker(20),
            "Should handle negative logits correctly"
        );
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::collections::{HashMap, HashSet};
use dynamo_tokens::SequenceHash;
use serde::{Deserialize, Serialize};
use super::config::RouterConfigOverride;
use crate::protocols::{DpRank, OverlapScores, WorkerId, WorkerWithDpRank};
/// Serializable load figures for one worker and data-parallel rank.
///
/// NOTE(review): presumably the load the rank would carry if a given request were routed to
/// it — the producer is not visible from this file, confirm there.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad {
/// Worker the figures belong to.
pub worker_id: WorkerId,
/// Data-parallel rank within that worker.
pub dp_rank: DpRank,
/// Potential prefill load, in tokens.
pub potential_prefill_tokens: usize,
/// Potential decode load, in blocks.
pub potential_decode_blocks: usize,
}
/// Errors reported by the KV scheduling layer.
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
/// No worker is available for the request; the default selector returns this when the
/// worker map is empty or none of the workers is in the request's allowed set.
#[error("no endpoints available to route work")]
NoEndpoints,
/// The endpoint subscriber has shut down.
#[error("endpoint subscriber shutdown")]
SubscriberShutdown,
/// The event publisher could not be initialized; carries a description of the failure.
#[error("failed to initialize event publisher: {0}")]
InitFailed(String),
}
/// Successful answer to a [`SchedulingRequest`].
#[derive(Debug)]
pub struct SchedulingResponse {
/// The selected worker and data-parallel rank.
pub best_worker: WorkerWithDpRank,
/// Overlap (cached) blocks reported for the selected worker.
pub overlap_blocks: u32,
}
/// A request to choose a worker, with a one-shot channel for the answer.
pub struct SchedulingRequest {
/// Request id; required for booking the request in the slot tracker when `update_states`
/// is true.
pub maybe_request_id: Option<String>,
/// Sequence hashes of the request's blocks, if available.
pub token_seq: Option<Vec<SequenceHash>>,
/// Input sequence length in tokens.
pub isl_tokens: usize,
/// Per-worker overlap scores (and tree sizes) for this request.
pub overlaps: OverlapScores,
/// Potential decode blocks per worker; overwritten by the scheduler queue before selection.
pub decode_blocks: HashMap<WorkerWithDpRank, usize>,
/// Potential prefill tokens per worker; overwritten by the scheduler queue before selection.
pub prefill_tokens: HashMap<WorkerWithDpRank, usize>,
/// Optional per-request overrides of router settings.
pub router_config_override: Option<RouterConfigOverride>,
/// Whether the scheduled request should be booked in the slot tracker after responding.
pub update_states: bool,
/// LoRA adapter name, forwarded to the slot tracker when the request is booked.
pub lora_name: Option<String>,
/// Priority jump in seconds; decreases effective arrival time in the queue.
pub priority_jump: f64,
/// Optional set of allowed worker IDs to restrict routing decisions (EPP).
pub allowed_worker_ids: Option<HashSet<WorkerId>>,
/// Sender for the scheduling result; taken (set to `None`) by the first `respond` call.
pub resp_tx: Option<tokio::sync::oneshot::Sender<Result<SchedulingResponse, KvSchedulerError>>>,
}
impl SchedulingRequest {
    /// Deliver the scheduling result to the requester.
    ///
    /// The response channel is consumed by the first call; any later call only logs an
    /// error. A send failure (receiver already gone) is logged as well and otherwise ignored.
    pub fn respond(&mut self, result: Result<SchedulingResponse, KvSchedulerError>) {
        match self.resp_tx.take() {
            None => {
                tracing::error!("respond called multiple times on same request");
            }
            Some(tx) => {
                if let Err(_undelivered) = tx.send(result) {
                    tracing::error!("failed to send response to requestor");
                }
            }
        }
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub mod multi_worker;
pub mod single;
pub use multi_worker::*;
pub use single::*;
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Multi-worker extension of [`ActiveSequences`] using shared DashMap for lock-free concurrent
//! access, with pluggable event publishing and metric observation via traits.
//! Multi-worker extension of [`ActiveSequences`] with per-worker `parking_lot::RwLock` for
//! fine-grained concurrent access, with pluggable event publishing and metric observation via
//! traits.
//!
//! The two traits [`SequencePublisher`] and [`SequenceSubscriber`] abstract the runtime-specific
//! transport (e.g., NATS EventPublisher, Prometheus gauges) so that all business logic lives in
......@@ -10,15 +11,16 @@
use dashmap::DashMap;
use dynamo_tokens::SequenceHash;
use parking_lot::RwLock;
use std::collections::{HashMap, HashSet};
use std::future::Future;
use std::sync::Arc;
use tokio_util::sync::CancellationToken;
use super::single::{ActiveSequences, RequestId};
use crate::protocols::{
ActiveLoad, ActiveSequenceEvent, ActiveSequenceEventData, OverlapScores, WorkerWithDpRank,
};
use crate::sequence::{ActiveSequences, RequestId};
// ---------------------------------------------------------------------------
// Traits
......@@ -91,19 +93,48 @@ pub struct SequenceRequest {
pub lora_name: Option<String>,
}
// ---------------------------------------------------------------------------
// WorkerTable
// ---------------------------------------------------------------------------
/// Per-worker sequence state stored in a flat vector with a side index for lookup.
///
/// `index[w]` is the position in `slots` of the entry whose first element is `w`;
/// both fields are always rebuilt together so the two stay in sync.
struct WorkerTable {
    /// One entry per worker: the worker identity and its individually locked state.
    slots: Vec<(WorkerWithDpRank, RwLock<ActiveSequences>)>,
    /// Maps a worker to the position of its entry in `slots`.
    index: HashMap<WorkerWithDpRank, usize>,
}
impl WorkerTable {
    /// Build a table holding one empty [`ActiveSequences`] per `(worker_id, dp_rank)` pair.
    ///
    /// `dp_sizes` maps each worker ID to its data-parallel size; ranks `0..dp_size`
    /// are created for every worker. Each new `ActiveSequences` is constructed with
    /// `block_size`. Slot order follows the iteration order of `dp_sizes`, with ranks
    /// ascending within a worker.
    fn new(block_size: usize, dp_sizes: &HashMap<u64, u32>) -> Self {
        let mut table = Self {
            slots: Vec::new(),
            index: HashMap::new(),
        };

        // Expand every worker ID into its individual data-parallel ranks.
        let ranked_workers = dp_sizes.iter().flat_map(|(&worker_id, &dp_size)| {
            (0..dp_size).map(move |dp_rank| WorkerWithDpRank::new(worker_id, dp_rank))
        });

        // `position` always equals the slot count before the push, so the index
        // entry points at the slot being appended.
        for (position, worker) in ranked_workers.enumerate() {
            table
                .slots
                .push((worker, RwLock::new(ActiveSequences::new(block_size))));
            table.index.insert(worker, position);
        }

        table
    }
}
// ---------------------------------------------------------------------------
// ActiveSequencesMultiWorker
// ---------------------------------------------------------------------------
/// Multi-worker extension of [`ActiveSequences`] with per-worker `parking_lot::RwLock` for
/// fine-grained concurrent access.
///
/// The outer `RwLock<WorkerTable>` is held only during sync blocks (never across `.await`),
/// while each worker slot has its own `RwLock<ActiveSequences>` for per-worker fine-grained
/// locking with cache-friendly Vec layout.
///
/// Generic over `P: SequencePublisher` to decouple from runtime-specific event transport
/// and metrics infrastructure.
pub struct ActiveSequencesMultiWorker<P: SequencePublisher> {
workers: Arc<DashMap<WorkerWithDpRank, ActiveSequences>>,
request_to_worker: Arc<DashMap<RequestId, WorkerWithDpRank>>,
request_to_lora: Arc<DashMap<RequestId, String>>,
workers: RwLock<WorkerTable>,
request_to_worker: DashMap<RequestId, WorkerWithDpRank>,
request_to_lora: DashMap<RequestId, String>,
block_size: usize,
router_id: u64,
publisher: P,
......@@ -125,21 +156,10 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
) -> Self {
assert!(block_size > 1, "block_size must be greater than 1");
let workers = Arc::new(DashMap::new());
let request_to_worker = Arc::new(DashMap::new());
let request_to_lora = Arc::new(DashMap::new());
for (worker_id, dp_size) in dp_sizes {
for dp_rank in 0..dp_size {
let worker = WorkerWithDpRank::new(worker_id, dp_rank);
workers.insert(worker, ActiveSequences::new(block_size));
}
}
Self {
workers,
request_to_worker,
request_to_lora,
workers: RwLock::new(WorkerTable::new(block_size, &dp_sizes)),
request_to_worker: DashMap::new(),
request_to_lora: DashMap::new(),
block_size,
router_id,
publisher,
......@@ -202,8 +222,9 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
.insert(event.request_id.clone(), lora_name.clone());
}
if let Some(mut entry) = self.workers.get_mut(&event.worker) {
entry.add_request(
let table = self.workers.read();
if let Some(&idx) = table.index.get(&event.worker) {
table.slots[idx].1.write().add_request(
event.request_id.clone(),
token_sequence.clone(),
*isl,
......@@ -220,18 +241,25 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
ActiveSequenceEventData::Free => {
if let Some((_, worker)) =
self.request_to_worker.remove(&event.request_id)
&& let Some(mut entry) = self.workers.get_mut(&worker)
{
entry.free(&event.request_id);
let table = self.workers.read();
if let Some(&idx) = table.index.get(&worker) {
table.slots[idx].1.write().free(&event.request_id);
}
}
self.request_to_lora.remove(&event.request_id);
}
ActiveSequenceEventData::MarkPrefillCompleted => {
if let Some(worker) =
self.request_to_worker.get(&event.request_id)
&& let Some(mut entry) = self.workers.get_mut(&*worker)
{
entry.mark_prefill_completed(&event.request_id);
let worker =
self.request_to_worker.get(&event.request_id).map(|r| *r);
if let Some(worker) = worker {
let table = self.workers.read();
if let Some(&idx) = table.index.get(&worker) {
table.slots[idx]
.1
.write()
.mark_prefill_completed(&event.request_id);
}
}
}
}
......@@ -249,27 +277,23 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
/// Update the set of workers, adding and removing as needed.
///
/// `new_dp_sizes` maps worker IDs to their data-parallel size.
pub fn update_workers(&self, new_dp_sizes: HashMap<u64, u32>) {
let current_workers: HashSet<WorkerWithDpRank> =
self.workers.iter().map(|entry| *entry.key()).collect();
pub fn update_workers(&self, new_dp_sizes: &HashMap<u64, u32>) {
let mut table = self.workers.write();
let mut new_workers: HashSet<WorkerWithDpRank> = HashSet::new();
for (worker_id, dp_size) in &new_dp_sizes {
for dp_rank in 0..*dp_size {
new_workers.insert(WorkerWithDpRank::new(*worker_id, dp_rank));
let mut target_workers: HashSet<WorkerWithDpRank> = HashSet::new();
for (&worker_id, &dp_size) in new_dp_sizes {
for dp_rank in 0..dp_size {
target_workers.insert(WorkerWithDpRank::new(worker_id, dp_rank));
}
}
let workers_to_remove: Vec<WorkerWithDpRank> =
current_workers.difference(&new_workers).copied().collect();
let workers_to_add: Vec<WorkerWithDpRank> =
new_workers.difference(&current_workers).copied().collect();
for worker in &workers_to_remove {
// Clean up request mappings for workers being removed.
for (worker, _) in &table.slots {
if target_workers.contains(worker) {
continue;
}
tracing::warn!("Removing worker {:?}", worker);
self.workers.remove(worker);
let requests_to_remove: Vec<RequestId> = self
.request_to_worker
.iter()
......@@ -285,10 +309,25 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
}
}
for worker in &workers_to_add {
// Drain old slots, preserving ActiveSequences for retained workers.
let mut old: HashMap<WorkerWithDpRank, ActiveSequences> = table
.slots
.drain(..)
.map(|(w, lock)| (w, lock.into_inner()))
.collect();
table.index.clear();
// Rebuild with target workers, reusing state where possible.
for worker in target_workers {
if !old.contains_key(&worker) {
tracing::warn!("Adding worker {:?}", worker);
self.workers
.insert(*worker, ActiveSequences::new(self.block_size));
}
let idx = table.slots.len();
let seq = old
.remove(&worker)
.unwrap_or_else(|| ActiveSequences::new(self.block_size));
table.slots.push((worker, RwLock::new(seq)));
table.index.insert(worker, idx);
}
}
......@@ -303,7 +342,7 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
lora_name,
} = req;
if !self.workers.contains_key(&worker) {
if !self.workers.read().index.contains_key(&worker) {
return Err(SequenceError::WorkerNotFound { worker });
}
......@@ -337,11 +376,13 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
}
let removed_requests = {
let mut entry = self
.workers
.get_mut(&worker)
let table = self.workers.read();
let &idx = table
.index
.get(&worker)
.ok_or(SequenceError::WorkerNotFound { worker })?;
entry.add_request(
let mut seq = table.slots[idx].1.write();
seq.add_request(
request_id,
token_sequence,
isl,
......@@ -394,11 +435,13 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
}
{
let mut entry = self
.workers
.get_mut(&worker)
let table = self.workers.read();
let &idx = table
.index
.get(&worker)
.ok_or(SequenceError::WorkerNotFound { worker })?;
mutate_fn(&mut entry, request_id);
let mut seq = table.slots[idx].1.write();
mutate_fn(&mut seq, request_id);
}
if remove_mapping {
......@@ -471,11 +514,13 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
})?;
let success = {
let mut entry = self
.workers
.get_mut(&worker)
let table = self.workers.read();
let &idx = table
.index
.get(&worker)
.ok_or(SequenceError::WorkerNotFound { worker })?;
entry.add_output_block(request_id, decay_fraction)
let mut seq = table.slots[idx].1.write();
seq.add_output_block(request_id, decay_fraction)
};
if !success {
......@@ -492,11 +537,13 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
/// Read active blocks/tokens from a worker and publish ActiveLoad metrics.
fn publish_active_load_for_worker(&self, worker: WorkerWithDpRank) {
let (active_blocks, active_tokens) = {
let Some(entry) = self.workers.get(&worker) else {
let table = self.workers.read();
let Some(&idx) = table.index.get(&worker) else {
tracing::warn!("Worker {worker:?} not found when publishing ActiveLoad");
return;
};
(entry.active_blocks(), entry.active_tokens())
let seq = table.slots[idx].1.read();
(seq.active_blocks(), seq.active_tokens())
};
self.publisher
......@@ -514,7 +561,7 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
/// Get the number of workers.
pub fn num_workers(&self) -> usize {
self.workers.len()
self.workers.read().slots.len()
}
/// Get the worker type for this router ("prefill" or "decode").
......@@ -523,13 +570,11 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
}
/// Query all workers for the number of new blocks that would be added by a token sequence.
pub fn new_blocks(
&self,
token_sequence: Vec<SequenceHash>,
) -> HashMap<WorkerWithDpRank, usize> {
let mut results = HashMap::with_capacity(self.workers.len());
for entry in self.workers.iter() {
results.insert(*entry.key(), entry.value().new_blocks(&token_sequence));
pub fn new_blocks(&self, token_sequence: &[SequenceHash]) -> HashMap<WorkerWithDpRank, usize> {
let table = self.workers.read();
let mut results = HashMap::with_capacity(table.slots.len());
for (worker, lock) in &table.slots {
results.insert(*worker, lock.read().new_blocks(token_sequence));
}
results
}
......@@ -537,14 +582,12 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
/// Query all workers for the total number of blocks (new + active) that would be used.
pub fn potential_blocks(
&self,
token_sequence: Vec<SequenceHash>,
token_sequence: &[SequenceHash],
) -> HashMap<WorkerWithDpRank, usize> {
let mut results = HashMap::with_capacity(self.workers.len());
for entry in self.workers.iter() {
results.insert(
*entry.key(),
entry.value().potential_blocks(&token_sequence),
);
let table = self.workers.read();
let mut results = HashMap::with_capacity(table.slots.len());
for (worker, lock) in &table.slots {
results.insert(*worker, lock.read().potential_blocks(token_sequence));
}
results
}
......@@ -552,7 +595,7 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
/// Query all workers for the potential blocks and tokens.
pub fn potential_blocks_and_tokens(
&self,
token_sequence: Option<Vec<SequenceHash>>,
token_sequence: Option<&[SequenceHash]>,
isl: usize,
overlaps: OverlapScores,
) -> (
......@@ -561,22 +604,23 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
) {
#[cfg(feature = "bench")]
let start = tokio::time::Instant::now();
let table = self.workers.read();
#[cfg(feature = "bench")]
let num_workers = self.workers.len();
let num_workers = table.slots.len();
let mut potential_blocks = HashMap::with_capacity(self.workers.len());
let mut potential_tokens = HashMap::with_capacity(self.workers.len());
let mut potential_blocks = HashMap::with_capacity(table.slots.len());
let mut potential_tokens = HashMap::with_capacity(table.slots.len());
for entry in self.workers.iter() {
let worker = *entry.key();
let overlap = *overlaps.scores.get(&worker).unwrap_or(&0);
for (worker, lock) in &table.slots {
let overlap = *overlaps.scores.get(worker).unwrap_or(&0);
let (blocks, tokens) =
entry
.value()
.potential_blocks_and_tokens(token_sequence.as_deref(), isl, overlap);
potential_blocks.insert(worker, blocks);
potential_tokens.insert(worker, tokens);
lock.read()
.potential_blocks_and_tokens(token_sequence, isl, overlap);
potential_blocks.insert(*worker, blocks);
potential_tokens.insert(*worker, tokens);
}
#[cfg(feature = "bench")]
......@@ -594,18 +638,20 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
/// Query all workers for their current number of active blocks.
pub fn active_blocks(&self) -> HashMap<WorkerWithDpRank, usize> {
let mut results = HashMap::with_capacity(self.workers.len());
for entry in self.workers.iter() {
results.insert(*entry.key(), entry.value().active_blocks());
let table = self.workers.read();
let mut results = HashMap::with_capacity(table.slots.len());
for (worker, lock) in &table.slots {
results.insert(*worker, lock.read().active_blocks());
}
results
}
/// Query all workers for their current number of active tokens.
pub fn active_tokens(&self) -> HashMap<WorkerWithDpRank, usize> {
let mut results = HashMap::with_capacity(self.workers.len());
for entry in self.workers.iter() {
results.insert(*entry.key(), entry.value().active_tokens());
let table = self.workers.read();
let mut results = HashMap::with_capacity(table.slots.len());
for (worker, lock) in &table.slots {
results.insert(*worker, lock.read().active_tokens());
}
results
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment