feat: add Rayon compute pool for CPU-intensive operations (#2969)

Signed-off-by: Ryan Olson <rolson@nvidia.com>

feat: add Rayon compute pool for CPU-intensive operations (#2969)
Signed-off-by: Ryan Olson <rolson@nvidia.com>
a13c4cb6 · Ryan Olson · GitHub · 7ebbd001 · a13c4cb6 · a13c4cb6
Unverified Commit a13c4cb6 authored Sep 26, 2025 by Ryan Olson Committed by GitHub Sep 26, 2025
19 changed files
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -71,12 +71,6 @@ version = "0.2.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"

-[[package]]
-name = "android-tzdata"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
-
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -86,6 +80,12 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
 [[package]]
 name = "anstream"
 version = "0.6.20"
@@ -138,9 +138,9 @@ dependencies = [

 [[package]]
 name = "anyhow"
-version = "1.0.99"
+version = "1.0.100"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100"
+checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
 dependencies = [
 "backtrace",
 ]
@@ -386,9 +386,9 @@ dependencies = [

 [[package]]
 name = "aws-lc-rs"
-version = "1.13.3"
+version = "1.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c953fe1ba023e6b7730c0d4b031d06f267f23a46167dcbd40316644b10a17ba"
+checksum = "94b8ff6c09cd57b16da53641caa860168b88c172a5ee163b0288d3d6eea12786"
 dependencies = [
 "aws-lc-sys",
 "zeroize",
@@ -396,11 +396,11 @@ dependencies = [

 [[package]]
 name = "aws-lc-sys"
-version = "0.30.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dbfd150b5dbdb988bcc8fb1fe787eb6b7ee6180ca24da683b61ea5405f3d43ff"
+checksum = "0e44d16778acaf6a9ec9899b92cebd65580b83f685446bf2e1f5d3d732f99dcd"
 dependencies = [
- "bindgen 0.69.5",
+ "bindgen 0.72.1",
 "cc",
 "cmake",
 "dunce",
@@ -604,36 +604,13 @@ dependencies = [
 "serde",
 ]

-[[package]]
-name = "bindgen"
-version = "0.69.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
-dependencies = [
- "bitflags 2.9.3",
- "cexpr",
- "clang-sys",
- "itertools 0.12.1",
- "lazy_static",
- "lazycell",
- "log",
- "prettyplease",
- "proc-macro2",
- "quote",
- "regex",
- "rustc-hash 1.1.0",
- "shlex",
- "syn 2.0.106",
- "which",
-]
-
 [[package]]
 name = "bindgen"
 version = "0.71.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "cexpr",
 "clang-sys",
 "itertools 0.13.0",
@@ -649,11 +626,11 @@ dependencies = [

 [[package]]
 name = "bindgen"
-version = "0.72.0"
+version = "0.72.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f72209734318d0b619a5e0f5129918b848c416e122a3c4ce054e03cb87b726f"
+checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "cexpr",
 "clang-sys",
 "itertools 0.13.0",
@@ -732,9 +709,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"

 [[package]]
 name = "bitflags"
-version = "2.9.3"
+version = "2.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d"
+checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394"
 dependencies = [
 "serde",
 ]
@@ -777,9 +754,9 @@ dependencies = [

 [[package]]
 name = "bm25"
-version = "2.3.1"
+version = "2.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b84ff0d57042bc263e2ebadb3703424b59b65870902649a2b3d0f4d7ab863244"
+checksum = "1cbd8ffdfb7b4c2ff038726178a780a94f90525ed0ad264c0afaa75dd8c18a64"
 dependencies = [
 "cached",
 "deunicode",
@@ -1010,9 +987,9 @@ version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fce8dd7fcfcbf3a0a87d8f515194b49d6135acab73e18bd380d1d93bb1a15eb"
 dependencies = [
- "clap 4.5.46",
+ "clap 4.5.48",
 "heck 0.4.1",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "log",
 "proc-macro2",
 "quote",
@@ -1025,10 +1002,11 @@ dependencies = [

 [[package]]
 name = "cc"
-version = "1.2.34"
+version = "1.2.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "42bc4aea80032b7bf409b0bc7ccad88853858911b7713a8062fdc0623867bedc"
+checksum = "80f41ae168f955c12fb8960b057d70d0ca153fb83182b57d86380443527be7e9"
 dependencies = [
+ "find-msvc-tools",
 "jobserver",
 "libc",
 "shlex",
@@ -1077,7 +1055,7 @@ version = "0.13.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7fe45e18904af7af10e4312df7c97251e98af98c70f42f1f2587aecfcbee56bf"
 dependencies = [
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "lazy_static",
 "num-traits",
 "regex",
@@ -1087,17 +1065,43 @@ dependencies = [

 [[package]]
 name = "chrono"
-version = "0.4.41"
+version = "0.4.42"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d"
+checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2"
 dependencies = [
- "android-tzdata",
 "iana-time-zone",
 "js-sys",
 "num-traits",
 "serde",
 "wasm-bindgen",
- "windows-link",
+ "windows-link 0.2.0",
+]
+
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half 2.6.0",
 ]

 [[package]]
@@ -1124,9 +1128,9 @@ dependencies = [

 [[package]]
 name = "clap"
-version = "4.5.46"
+version = "4.5.48"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c5e4fcf9c21d2e544ca1ee9d8552de13019a42aa7dbf32747fa7aaf1df76e57"
+checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae"
 dependencies = [
 "clap_builder",
 "clap_derive",
@@ -1134,9 +1138,9 @@ dependencies = [

 [[package]]
 name = "clap_builder"
-version = "4.5.46"
+version = "4.5.48"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fecb53a0e6fcfb055f686001bc2e2592fa527efaf38dbe81a6a9563562e57d41"
+checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9"
 dependencies = [
 "anstream",
 "anstyle",
@@ -1147,9 +1151,9 @@ dependencies = [

 [[package]]
 name = "clap_derive"
-version = "4.5.45"
+version = "4.5.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14cb31bb0a7d536caef2639baa7fad459e15c3144efefa6dbd1c84562c4739f6"
+checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c"
 dependencies = [
 "heck 0.5.0",
 "proc-macro2",
@@ -1210,9 +1214,9 @@ dependencies = [

 [[package]]
 name = "config"
-version = "0.15.15"
+version = "0.15.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0faa974509d38b33ff89282db9c3295707ccf031727c0de9772038ec526852ba"
+checksum = "680d3ac2fe066c43300ec831c978871e50113a708d58ab13d231bd92deca5adb"
 dependencies = [
 "async-trait",
 "convert_case",
@@ -1220,10 +1224,10 @@ dependencies = [
 "pathdiff",
 "ron",
 "rust-ini",
- "serde",
 "serde-untagged",
+ "serde_core",
 "serde_json",
- "toml 0.9.5",
+ "toml 0.9.7",
 "winnow",
 "yaml-rust2",
 ]
@@ -1385,7 +1389,7 @@ dependencies = [
 "atty",
 "cast",
 "clap 2.34.0",
- "criterion-plot",
+ "criterion-plot 0.4.5",
 "csv",
 "itertools 0.10.5",
 "lazy_static",
@@ -1402,6 +1406,34 @@ dependencies = [
 "walkdir",
 ]

+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap 4.5.48",
+ "criterion-plot 0.5.0",
+ "futures",
+ "is-terminal",
+ "itertools 0.10.5",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "tokio",
+ "walkdir",
+]
+
 [[package]]
 name = "criterion-plot"
 version = "0.4.5"
@@ -1412,6 +1444,16 @@ dependencies = [
 "itertools 0.10.5",
 ]

+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools 0.10.5",
+]
+
 [[package]]
 name = "crossbeam"
 version = "0.8.4"
@@ -1565,9 +1607,9 @@ dependencies = [

 [[package]]
 name = "cudarc"
-version = "0.17.2"
+version = "0.17.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8147ca46109d41cc513fd629b52bbea9bd09b972034c2f32954ce84a92895a91"
+checksum = "72ba848ae5c6f3cb36e71eab5f268763e3fabcabe3f7bc683e16f7fa3d46281e"
 dependencies = [
 "libloading",
 ]
@@ -1618,6 +1660,16 @@ dependencies = [
 "darling_macro 0.20.11",
 ]

+[[package]]
+name = "darling"
+version = "0.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0"
+dependencies = [
+ "darling_core 0.21.3",
+ "darling_macro 0.21.3",
+]
+
 [[package]]
 name = "darling_core"
 version = "0.11.0"
@@ -1646,6 +1698,20 @@ dependencies = [
 "syn 2.0.106",
 ]

+[[package]]
+name = "darling_core"
+version = "0.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4"
+dependencies = [
+ "fnv",
+ "ident_case",
+ "proc-macro2",
+ "quote",
+ "strsim 0.11.1",
+ "syn 2.0.106",
+]
+
 [[package]]
 name = "darling_macro"
 version = "0.11.0"
@@ -1668,11 +1734,22 @@ dependencies = [
 "syn 2.0.106",
 ]

+[[package]]
+name = "darling_macro"
+version = "0.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81"
+dependencies = [
+ "darling_core 0.21.3",
+ "quote",
+ "syn 2.0.106",
+]
+
 [[package]]
 name = "dary_heap"
-version = "0.3.7"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728"
+checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04"
 dependencies = [
 "serde",
 ]
@@ -1715,12 +1792,12 @@ dependencies = [

 [[package]]
 name = "deranged"
-version = "0.4.0"
+version = "0.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e"
+checksum = "a41953f86f8a05768a6cda24def994fd2f424b04ec5c719cf89989779f199071"
 dependencies = [
 "powerfmt",
- "serde",
+ "serde_core",
 ]

 [[package]]
@@ -1919,7 +1996,7 @@ dependencies = [
 "libc",
 "option-ext",
 "redox_users",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.0",
 ]

 [[package]]
@@ -2019,7 +2096,7 @@ dependencies = [
 "tokio-util",
 "tracing",
 "url",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 ]

 [[package]]
@@ -2045,7 +2122,7 @@ dependencies = [
 "dynamo-llm",
 "dynamo-runtime",
 "either",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "mistralrs",
 "serde_json",
 "tokio",
@@ -2069,15 +2146,15 @@ dependencies = [
 "async_zmq",
 "axum 0.8.4",
 "axum-server",
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "blake3",
 "bs62",
 "bytemuck",
 "bytes",
 "candle-core 0.9.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "chrono",
- "criterion",
- "cudarc 0.17.2",
+ "criterion 0.3.6",
+ "cudarc 0.17.3",
 "dashmap",
 "derive-getters",
 "derive_builder",
@@ -2140,7 +2217,7 @@ dependencies = [
 "tracing",
 "unicode-segmentation",
 "url",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 "validator",
 "xxhash-rust",
 "zeromq",
@@ -2161,7 +2238,7 @@ dependencies = [
 "serde_json",
 "tokio",
 "tracing",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 ]

 [[package]]
@@ -2171,7 +2248,7 @@ dependencies = [
 "anyhow",
 "async-stream",
 "async-trait",
- "clap 4.5.46",
+ "clap 4.5.48",
 "dynamo-async-openai",
 "dynamo-engine-llamacpp",
 "dynamo-engine-mistralrs",
@@ -2189,7 +2266,7 @@ dependencies = [
 "tokio-util",
 "tracing",
 "tracing-subscriber",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 "vergen-gitcl",
 ]

@@ -2211,6 +2288,7 @@ dependencies = [
 "bytes",
 "chrono",
 "console-subscriber",
+ "criterion 0.5.1",
 "derive-getters",
 "derive_builder",
 "educe",
@@ -2229,6 +2307,7 @@ dependencies = [
 "once_cell",
 "prometheus",
 "rand 0.9.2",
+ "rayon",
 "regex",
 "reqwest 0.12.23",
 "rstest 0.23.0",
@@ -2240,13 +2319,14 @@ dependencies = [
 "tempfile",
 "thiserror 2.0.16",
 "tokio",
+ "tokio-rayon",
 "tokio-stream",
 "tokio-util",
 "tower-http",
 "tracing",
 "tracing-subscriber",
 "url",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 "validator",
 "xxhash-rust",
 ]
@@ -2434,22 +2514,23 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"

 [[package]]
 name = "erased-serde"
-version = "0.4.6"
+version = "0.4.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e004d887f51fcb9fef17317a2f3525c887d8aa3f4f50fed920816a688284a5b7"
+checksum = "259d404d09818dec19332e31d94558aeb442fea04c817006456c24b5460bbd4b"
 dependencies = [
 "serde",
+ "serde_core",
 "typeid",
 ]

 [[package]]
 name = "errno"
-version = "0.3.13"
+version = "0.3.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
 "libc",
- "windows-sys 0.60.2",
+ "windows-sys 0.61.0",
 ]

 [[package]]
@@ -2547,6 +2628,26 @@ version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"

+[[package]]
+name = "fax"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f05de7d48f37cd6730705cbca900770cab77a89f413d23e100ad7fad7795a0ab"
+dependencies = [
+ "fax_derive",
+]
+
+[[package]]
+name = "fax_derive"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0aca10fb742cb43f9e7bb8467c91aa9bcb8e3ffbc6a6f7389bb93ffc920577d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.106",
+]
+
 [[package]]
 name = "fdeflate"
 version = "0.3.7"
@@ -2579,6 +2680,12 @@ dependencies = [
 "version_check",
 ]

+[[package]]
+name = "find-msvc-tools"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ced73b1dacfc750a6db6c0a0c3a3853c8b41997e2e2c563dc90804ae6867959"
+
 [[package]]
 name = "find_cuda_helper"
 version = "0.2.0"
@@ -2690,9 +2797,9 @@ dependencies = [

 [[package]]
 name = "fs-err"
-version = "3.1.1"
+version = "3.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88d7be93788013f265201256d58f04936a8079ad5dc898743aa20525f503b683"
+checksum = "44f150ffc8782f35521cec2b23727707cb4045706ba3c854e86bef66b3a8cdbd"
 dependencies = [
 "autocfg",
 "tokio",
@@ -2880,7 +2987,7 @@ dependencies = [
 "num-complex",
 "num-traits",
 "paste",
- "raw-cpuid 11.5.0",
+ "raw-cpuid 11.6.0",
 "seq-macro",
 ]

@@ -2910,7 +3017,7 @@ dependencies = [
 "num-complex",
 "num-traits",
 "paste",
- "raw-cpuid 11.5.0",
+ "raw-cpuid 11.6.0",
 "seq-macro",
 ]

@@ -2940,7 +3047,7 @@ dependencies = [
 "num-complex",
 "num-traits",
 "paste",
- "raw-cpuid 11.5.0",
+ "raw-cpuid 11.6.0",
 "seq-macro",
 ]

@@ -2979,7 +3086,7 @@ dependencies = [
 "once_cell",
 "paste",
 "pulp 0.21.5",
- "raw-cpuid 11.5.0",
+ "raw-cpuid 11.6.0",
 "rayon",
 "seq-macro",
 "sysctl 0.6.0",
@@ -3016,7 +3123,7 @@ dependencies = [
 "num-complex",
 "num-traits",
 "paste",
- "raw-cpuid 11.5.0",
+ "raw-cpuid 11.6.0",
 "rayon",
 "seq-macro",
 ]
@@ -3047,7 +3154,7 @@ dependencies = [
 "num-complex",
 "num-traits",
 "paste",
- "raw-cpuid 11.5.0",
+ "raw-cpuid 11.6.0",
 "seq-macro",
 ]

@@ -3077,7 +3184,7 @@ dependencies = [
 "num-complex",
 "num-traits",
 "paste",
- "raw-cpuid 11.5.0",
+ "raw-cpuid 11.6.0",
 "seq-macro",
 ]

@@ -3093,9 +3200,9 @@ dependencies = [

 [[package]]
 name = "getopts"
-version = "0.2.23"
+version = "0.2.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cba6ae63eb948698e300f645f87c70f76630d505f23b8907cf1e193ee85048c1"
+checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
 dependencies = [
 "unicode-width 0.2.1",
 ]
@@ -3123,7 +3230,7 @@ dependencies = [
 "js-sys",
 "libc",
 "r-efi",
- "wasi 0.14.3+wasi-0.2.4",
+ "wasi 0.14.7+wasi-0.2.4",
 "wasm-bindgen",
 ]

@@ -3146,7 +3253,7 @@ checksum = "3ac5654356c6f7f6116905aeaf92ab002c3d03414ada5dbe0bb2e32aa5fea173"
 dependencies = [
 "fancy-regex 0.14.0",
 "ggml-quants",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "log",
 "num_enum",
 ]
@@ -3198,7 +3305,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "http 0.2.12",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "slab",
 "tokio",
 "tokio-util",
@@ -3217,7 +3324,7 @@ dependencies = [
 "futures-core",
 "futures-sink",
 "http 1.3.1",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "slab",
 "tokio",
 "tokio-util",
@@ -3270,6 +3377,12 @@ dependencies = [
 "foldhash",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d"
+
 [[package]]
 name = "hashlink"
 version = "0.10.0"
@@ -3348,15 +3461,6 @@ dependencies = [
 "windows-sys 0.60.2",
 ]

-[[package]]
-name = "home"
-version = "0.5.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf"
-dependencies = [
- "windows-sys 0.59.0",
-]
-
 [[package]]
 name = "hound"
 version = "3.5.1"
@@ -3469,9 +3573,9 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"

 [[package]]
 name = "humantime"
-version = "2.2.0"
+version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f"
+checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424"

 [[package]]
 name = "hyper"
@@ -3569,9 +3673,9 @@ dependencies = [

 [[package]]
 name = "hyper-util"
-version = "0.1.16"
+version = "0.1.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e"
+checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8"
 dependencies = [
 "base64 0.22.1",
 "bytes",
@@ -3595,9 +3699,9 @@ dependencies = [

 [[package]]
 name = "iana-time-zone"
-version = "0.1.63"
+version = "0.1.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8"
+checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb"
 dependencies = [
 "android_system_properties",
 "core-foundation-sys",
@@ -3605,7 +3709,7 @@ dependencies = [
 "js-sys",
 "log",
 "wasm-bindgen",
- "windows-core 0.61.2",
+ "windows-core 0.62.0",
 ]

 [[package]]
@@ -3732,9 +3836,9 @@ dependencies = [

 [[package]]
 name = "image"
-version = "0.25.6"
+version = "0.25.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db35664ce6b9810857a38a906215e75a9c879f0696556a39f59c62829710251a"
+checksum = "529feb3e6769d234375c4cf1ee2ce713682b8e76538cb13f9fc23e1400a591e7"
 dependencies = [
 "bytemuck",
 "byteorder-lite",
@@ -3742,6 +3846,7 @@ dependencies = [
 "exr",
 "gif",
 "image-webp",
+ "moxcms",
 "num-traits",
 "png",
 "qoi",
@@ -3765,9 +3870,9 @@ dependencies = [

 [[package]]
 name = "imgref"
-version = "1.11.0"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0263a3d970d5c054ed9312c0057b4f3bde9c0b33836d3637361d4a9e6e7a408"
+checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8"

 [[package]]
 name = "indexmap"
@@ -3782,13 +3887,14 @@ dependencies = [

 [[package]]
 name = "indexmap"
-version = "2.11.0"
+version = "2.11.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2481980430f9f78649238835720ddccc57e52df14ffce1c6f37391d61b563e9"
+checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5"
 dependencies = [
 "equivalent",
- "hashbrown 0.15.5",
+ "hashbrown 0.16.0",
 "serde",
+ "serde_core",
 ]

 [[package]]
@@ -3813,9 +3919,9 @@ checksum = "c8fae54786f62fb2918dcfae3d568594e50eb9b5c25bf04371af6fe7516452fb"

 [[package]]
 name = "insta"
-version = "1.43.1"
+version = "1.43.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "154934ea70c58054b556dd430b99a98c2a7ff5309ac9891597e339b5c28f4371"
+checksum = "46fdb647ebde000f43b5b53f773c30cf9b0cb4300453208713fa38b2c70935a0"
 dependencies = [
 "console",
 "globset",
@@ -3867,7 +3973,7 @@ version = "0.7.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "cfg-if 1.0.3",
 "libc",
 ]
@@ -3909,6 +4015,17 @@ dependencies = [
 "syn 2.0.106",
 ]

+[[package]]
+name = "is-terminal"
+version = "0.4.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9"
+dependencies = [
+ "hermit-abi 0.5.2",
+ "libc",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.1"
@@ -4026,17 +4143,11 @@ dependencies = [
 "libc",
 ]

-[[package]]
-name = "jpeg-decoder"
-version = "0.3.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00810f1d8b74be64b13dbf3db89ac67740615d6c891f0e7b6179326533011a07"
-
 [[package]]
 name = "js-sys"
-version = "0.3.77"
+version = "0.3.81"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f"
+checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305"
 dependencies = [
 "once_cell",
 "wasm-bindgen",
@@ -4073,7 +4184,7 @@ dependencies = [
 "anyhow",
 "base64 0.21.7",
 "bytecount",
- "clap 4.5.46",
+ "clap 4.5.48",
 "fancy-regex 0.11.0",
 "fraction",
 "getrandom 0.2.16",
@@ -4090,7 +4201,7 @@ dependencies = [
 "serde_json",
 "time",
 "url",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 ]

 [[package]]
@@ -4125,23 +4236,17 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"

-[[package]]
-name = "lazycell"
-version = "1.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
-
 [[package]]
 name = "lebe"
-version = "0.5.2"
+version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8"
+checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8"

 [[package]]
 name = "libc"
-version = "0.2.175"
+version = "0.2.176"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543"
+checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174"

 [[package]]
 name = "libdynamo_llm"
@@ -4161,7 +4266,7 @@ dependencies = [
 "tokio-stream",
 "tracing",
 "tracing-subscriber",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 ]

 [[package]]
@@ -4176,12 +4281,12 @@ dependencies = [

 [[package]]
 name = "libloading"
-version = "0.8.8"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
 dependencies = [
 "cfg-if 1.0.3",
- "windows-targets 0.53.3",
+ "windows-link 0.2.0",
 ]

 [[package]]
@@ -4192,25 +4297,19 @@ checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de"

 [[package]]
 name = "libredox"
-version = "0.1.9"
+version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "391290121bad3d37fbddad76d8f5d1c1c314cfc646d143d7e07a3086ddff0ce3"
+checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "libc",
 ]

 [[package]]
 name = "linux-raw-sys"
-version = "0.4.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
-
-[[package]]
-name = "linux-raw-sys"
-version = "0.9.4"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
+checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"

 [[package]]
 name = "litemap"
@@ -4220,9 +4319,9 @@ checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956"

 [[package]]
 name = "llama-cpp-2"
-version = "0.1.118"
+version = "0.1.122"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "108004dde2928f96a3e515c8d8870f4fdef6a283fde6385bf0991993c7dde14b"
+checksum = "d574d1f43b31c9e3d0e3bf596a31fa628e4b66894eb5db4dfe78e861c7f74275"
 dependencies = [
 "enumflags2",
 "llama-cpp-sys-2",
@@ -4233,11 +4332,11 @@ dependencies = [

 [[package]]
 name = "llama-cpp-sys-2"
-version = "0.1.118"
+version = "0.1.122"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c070e35e0cb58e7b4dfb7cd04a1cc7686ab0f04355076f2d5cdc284a1d8a0cf"
+checksum = "e09bdf53b6f486ecaeb96b08cd8a9d9df162f2aafa37efb5b40cf421a419c755"
 dependencies = [
- "bindgen 0.72.0",
+ "bindgen 0.72.1",
 "cc",
 "cmake",
 "find_cuda_helper",
@@ -4252,7 +4351,7 @@ source = "git+https://github.com/guidance-ai/llguidance.git?rev=c432092#c432092d
 dependencies = [
 "anyhow",
 "derivre",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "regex-syntax",
 "serde",
 "serde_json",
@@ -4283,9 +4382,9 @@ dependencies = [

 [[package]]
 name = "log"
-version = "0.4.27"
+version = "0.4.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
+checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432"

 [[package]]
 name = "loop9"
@@ -4483,9 +4582,9 @@ dependencies = [

 [[package]]
 name = "memchr"
-version = "2.7.5"
+version = "2.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
+checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"

 [[package]]
 name = "memmap2"
@@ -4518,7 +4617,7 @@ version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c43f73953f8cbe511f021b58f18c3ce1c3d1ae13fe953293e13345bf83217f25"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "block",
 "core-graphics-types",
 "foreign-types 0.5.0",
@@ -4533,7 +4632,7 @@ version = "0.29.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7ecfd3296f8c56b7c1f6fbac3c71cefa9d78ce009850c45000015f206dc7fa21"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "block",
 "core-graphics-types",
 "foreign-types 0.5.0",
@@ -4547,7 +4646,7 @@ name = "metrics"
 version = "0.5.1"
 dependencies = [
 "axum 0.8.4",
- "clap 4.5.46",
+ "clap 4.5.48",
 "dynamo-llm",
 "dynamo-runtime",
 "futures",
@@ -4677,11 +4776,11 @@ dependencies = [
 "anyhow",
 "candle-core 0.9.1 (git+https://github.com/EricLBuehler/candle.git?rev=95d713f9)",
 "candle-nn",
- "clap 4.5.46",
+ "clap 4.5.48",
 "either",
 "futures",
 "image",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "mistralrs-core",
 "rand 0.9.2",
 "reqwest 0.12.23",
@@ -4722,7 +4821,7 @@ dependencies = [
 "candle-nn",
 "cfgrammar",
 "chrono",
- "clap 4.5.46",
+ "clap 4.5.48",
 "csv",
 "derive-new",
 "derive_more 2.0.1",
@@ -4738,7 +4837,7 @@ dependencies = [
 "html2text",
 "http 1.3.1",
 "image",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "indicatif",
 "interprocess",
 "itertools 0.14.0",
@@ -4793,7 +4892,7 @@ dependencies = [
 "tracing",
 "tracing-subscriber",
 "urlencoding",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 "variantly",
 "vob",
 ]
@@ -4815,7 +4914,7 @@ dependencies = [
 "tokio-tungstenite 0.24.0",
 "tracing",
 "utoipa",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 ]

 [[package]]
@@ -4877,7 +4976,7 @@ version = "0.1.0"
 source = "git+https://github.com/ai-dynamo/modelexpress.git?rev=a232220bf268a475d293914d407f4ae186f443e3#a232220bf268a475d293914d407f4ae186f443e3"
 dependencies = [
 "anyhow",
- "clap 4.5.46",
+ "clap 4.5.48",
 "colored",
 "futures",
 "modelexpress-common",
@@ -4889,7 +4988,7 @@ dependencies = [
 "tonic 0.13.1",
 "tracing",
 "tracing-subscriber",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 ]

 [[package]]
@@ -4900,7 +4999,7 @@ dependencies = [
 "anyhow",
 "async-trait",
 "chrono",
- "clap 4.5.46",
+ "clap 4.5.48",
 "config",
 "hf-hub",
 "jiff",
@@ -4917,25 +5016,36 @@ dependencies = [

 [[package]]
 name = "monostate"
-version = "0.1.14"
+version = "0.1.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aafe1be9d0c75642e3e50fedc7ecadf1ef1cbce6eb66462153fc44245343fbee"
+checksum = "1a2901b7478a273256ce419c446289eb3c883a790d0bf5ed2f363c0cc3988012"
 dependencies = [
 "monostate-impl",
 "serde",
+ "serde_core",
 ]

 [[package]]
 name = "monostate-impl"
-version = "0.1.14"
+version = "0.1.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c402a4092d5e204f32c9e155431046831fa712637043c58cb73bc6bc6c9663b5"
+checksum = "328f7435b7f54322b33832f1e981ff192781f0d12473f12d062705a55015de8d"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn 2.0.106",
 ]

+[[package]]
+name = "moxcms"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd32fa8935aeadb8a8a6b6b351e40225570a37c43de67690383d87ef170cd08"
+dependencies = [
+ "num-traits",
+ "pxfm",
+]
+
 [[package]]
 name = "multimap"
 version = "0.10.1"
@@ -5072,7 +5182,7 @@ version = "0.29.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "cfg-if 1.0.3",
 "cfg_aliases",
 "libc",
@@ -5374,7 +5484,7 @@ version = "6.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "libc",
 "once_cell",
 "onig_sys",
@@ -5405,7 +5515,7 @@ dependencies = [
 "anyhow",
 "base64 0.22.1",
 "bstr",
- "clap 4.5.46",
+ "clap 4.5.48",
 "fancy-regex 0.13.0",
 "futures",
 "image",
@@ -5426,7 +5536,7 @@ version = "0.10.73"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "cfg-if 1.0.3",
 "foreign-types 0.3.2",
 "libc",
@@ -5596,9 +5706,9 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"

 [[package]]
 name = "pest"
-version = "2.8.1"
+version = "2.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1db05f56d34358a8b1066f67cbb203ee3e7ed2ba674a6263a1d5ec6db2204323"
+checksum = "21e0a3a33733faeaf8651dfee72dd0f388f0c8e5ad496a3478fa5a922f49cfa8"
 dependencies = [
 "memchr",
 "thiserror 2.0.16",
@@ -5607,9 +5717,9 @@ dependencies = [

 [[package]]
 name = "pest_derive"
-version = "2.8.1"
+version = "2.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb056d9e8ea77922845ec74a1c4e8fb17e7c218cc4fc11a15c5d25e189aa40bc"
+checksum = "bc58706f770acb1dbd0973e6530a3cff4746fb721207feb3a8a6064cd0b6c663"
 dependencies = [
 "pest",
 "pest_generator",
@@ -5617,9 +5727,9 @@ dependencies = [

 [[package]]
 name = "pest_generator"
-version = "2.8.1"
+version = "2.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87e404e638f781eb3202dc82db6760c8ae8a1eeef7fb3fa8264b2ef280504966"
+checksum = "6d4f36811dfe07f7b8573462465d5cb8965fffc2e71ae377a33aecf14c2c9a2f"
 dependencies = [
 "pest",
 "pest_meta",
@@ -5630,9 +5740,9 @@ dependencies = [

 [[package]]
 name = "pest_meta"
-version = "2.8.1"
+version = "2.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edd1101f170f5903fde0914f899bb503d9ff5271d7ba76bbb70bea63690cc0d5"
+checksum = "42919b05089acbd0a5dcd5405fb304d17d1053847b81163d09c4ad18ce8e8420"
 dependencies = [
 "pest",
 "sha2",
@@ -5645,7 +5755,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772"
 dependencies = [
 "fixedbitset",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 ]

 [[package]]
@@ -5750,12 +5860,12 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"

 [[package]]
 name = "plist"
-version = "1.7.4"
+version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3af6b589e163c5a788fab00ce0c0366f6efbb9959c2f9874b224936af7fce7e1"
+checksum = "740ebea15c5d1428f910cd1a5f52cebf8d25006245ed8ade92702f4943d91e07"
 dependencies = [
 "base64 0.22.1",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "quick-xml",
 "serde",
 "time",
@@ -5791,11 +5901,11 @@ dependencies = [

 [[package]]
 name = "png"
-version = "0.17.16"
+version = "0.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "82151a2fc869e011c153adc57cf2789ccb8d9906ce52c0b39a6b5697749d7526"
+checksum = "97baced388464909d42d89643fe4361939af9b7ce7a31ee32a168f832a70f2a0"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.9.4",
 "crc32fast",
 "fdeflate",
 "flate2",
@@ -5868,11 +5978,11 @@ dependencies = [

 [[package]]
 name = "proc-macro-crate"
-version = "3.3.0"
+version = "3.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35"
+checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983"
 dependencies = [
- "toml_edit",
+ "toml_edit 0.23.6",
 ]

 [[package]]
@@ -5955,13 +6065,13 @@ dependencies = [

 [[package]]
 name = "proptest"
-version = "1.7.0"
+version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6fcdab19deb5195a31cf7726a210015ff1496ba1464fd42cb4f537b8b01b471f"
+checksum = "2bb0be07becd10686a0bb407298fb425360a5c44a663774406340c59a22de4ce"
 dependencies = [
 "bit-set 0.8.0",
 "bit-vec 0.8.0",
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "lazy_static",
 "num-traits",
 "rand 0.9.2",
@@ -6071,6 +6181,15 @@ dependencies = [
 "version_check",
 ]

+[[package]]
+name = "pxfm"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83f9b339b02259ada5c0f4a389b7fb472f933aa17ce176fd2ad98f28bb401fde"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "qoi"
 version = "0.4.1"
@@ -6339,11 +6458,11 @@ dependencies = [

 [[package]]
 name = "raw-cpuid"
-version = "11.5.0"
+version = "11.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146"
+checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 ]

 [[package]]
@@ -6410,7 +6529,7 @@ version = "0.5.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 ]

 [[package]]
@@ -6446,9 +6565,9 @@ dependencies = [

 [[package]]
 name = "regex"
-version = "1.11.2"
+version = "1.11.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912"
+checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c"
 dependencies = [
 "aho-corasick",
 "memchr",
@@ -6458,9 +6577,9 @@ dependencies = [

 [[package]]
 name = "regex-automata"
-version = "0.4.10"
+version = "0.4.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6"
+checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad"
 dependencies = [
 "aho-corasick",
 "memchr",
@@ -6630,7 +6749,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b91f7eff05f748767f183df4320a63d6936e9c6107d97c9e6bdd9784f4289c94"
 dependencies = [
 "base64 0.21.7",
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "serde",
 "serde_derive",
 ]
@@ -6776,9 +6895,9 @@ dependencies = [

 [[package]]
 name = "rustfft"
-version = "6.4.0"
+version = "6.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c6f140db74548f7c9d7cce60912c9ac414e74df5e718dc947d514b051b42f3f4"
+checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89"
 dependencies = [
 "num-complex",
 "num-integer",
@@ -6790,42 +6909,29 @@ dependencies = [

 [[package]]
 name = "rustix"
-version = "0.38.44"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
-dependencies = [
- "bitflags 2.9.3",
- "errno",
- "libc",
- "linux-raw-sys 0.4.15",
- "windows-sys 0.59.0",
-]
-
-[[package]]
-name = "rustix"
-version = "1.0.8"
+version = "1.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8"
+checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "errno",
 "libc",
- "linux-raw-sys 0.9.4",
- "windows-sys 0.60.2",
+ "linux-raw-sys",
+ "windows-sys 0.61.0",
 ]

 [[package]]
 name = "rustls"
-version = "0.23.31"
+version = "0.23.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc"
+checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40"
 dependencies = [
 "aws-lc-rs",
 "log",
 "once_cell",
 "ring",
 "rustls-pki-types",
- "rustls-webpki 0.103.4",
+ "rustls-webpki 0.103.6",
 "subtle",
 "zeroize",
 ]
@@ -6852,7 +6958,7 @@ dependencies = [
 "openssl-probe",
 "rustls-pki-types",
 "schannel",
- "security-framework 3.3.0",
+ "security-framework 3.5.0",
 ]

 [[package]]
@@ -6886,9 +6992,9 @@ dependencies = [

 [[package]]
 name = "rustls-webpki"
-version = "0.103.4"
+version = "0.103.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc"
+checksum = "8572f3c2cb9934231157b45499fc41e1f58c589fdfb81a844ba873265e80f8eb"
 dependencies = [
 "aws-lc-rs",
 "ring",
@@ -7026,11 +7132,11 @@ dependencies = [

 [[package]]
 name = "schannel"
-version = "0.1.27"
+version = "0.1.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d"
+checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1"
 dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.61.0",
 ]

 [[package]]
@@ -7124,7 +7230,7 @@ version = "2.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "core-foundation 0.9.4",
 "core-foundation-sys",
 "libc",
@@ -7133,11 +7239,11 @@ dependencies = [

 [[package]]
 name = "security-framework"
-version = "3.3.0"
+version = "3.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "80fb1d92c5028aa318b4b8bd7302a5bfcf48be96a37fc6fc790f806b0004ee0c"
+checksum = "cc198e42d9b7510827939c9a15f5062a0c913f3371d765977e586d2fe6c16f4a"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "core-foundation 0.10.1",
 "core-foundation-sys",
 "libc",
@@ -7146,9 +7252,9 @@ dependencies = [

 [[package]]
 name = "security-framework-sys"
-version = "2.14.0"
+version = "2.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32"
+checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0"
 dependencies = [
 "core-foundation-sys",
 "libc",
@@ -7160,7 +7266,7 @@ version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "cssparser",
 "derive_more 0.99.20",
 "fxhash",
@@ -7181,9 +7287,9 @@ checksum = "0f7d95a54511e0c7be3f51e8867aa8cf35148d7b9445d44de2f943e2b206e749"

 [[package]]
 name = "semver"
-version = "1.0.26"
+version = "1.0.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
+checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"

 [[package]]
 name = "seq-macro"
@@ -7193,10 +7299,11 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc"

 [[package]]
 name = "serde"
-version = "1.0.219"
+version = "1.0.226"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+checksum = "0dca6411025b24b60bfa7ec1fe1f8e710ac09782dca409ee8237ba74b51295fd"
 dependencies = [
+ "serde_core",
 "serde_derive",
 ]

@@ -7211,12 +7318,13 @@ dependencies = [

 [[package]]
 name = "serde-untagged"
-version = "0.1.8"
+version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34836a629bcbc6f1afdf0907a744870039b1e14c0561cb26094fa683b158eff3"
+checksum = "f9faf48a4a2d2693be24c6289dbe26552776eb7737074e6722891fadbe6c5058"
 dependencies = [
 "erased-serde",
 "serde",
+ "serde_core",
 "typeid",
 ]

@@ -7230,11 +7338,20 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "serde_core"
+version = "1.0.226"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba2ba63999edb9dac981fb34b3e5c0d111a69b0924e253ed29d83f7c99e966a4"
+dependencies = [
+ "serde_derive",
+]
+
 [[package]]
 name = "serde_derive"
-version = "1.0.219"
+version = "1.0.226"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+checksum = "8db53ae22f34573731bafa1db20f04027b2d25e02d8205921b569171699cdb33"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -7254,15 +7371,16 @@ dependencies = [

 [[package]]
 name = "serde_json"
-version = "1.0.143"
+version = "1.0.145"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a"
+checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
 dependencies = [
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "itoa",
 "memchr",
 "ryu",
 "serde",
+ "serde_core",
 ]

 [[package]]
@@ -7276,12 +7394,13 @@ dependencies = [

 [[package]]
 name = "serde_path_to_error"
-version = "0.1.17"
+version = "0.1.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59fab13f937fa393d08645bf3a84bdfe86e296747b506ada67bb15f10f218b2a"
+checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457"
 dependencies = [
 "itoa",
 "serde",
+ "serde_core",
 ]

 [[package]]
@@ -7315,11 +7434,11 @@ dependencies = [

 [[package]]
 name = "serde_spanned"
-version = "1.0.0"
+version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40734c41988f7306bb04f0ecf60ec0f3f1caa34290e4e8ea471dcd3346483b83"
+checksum = "5417783452c2be558477e104686f7de5dae53dba813c28435e0e70f82d9b04ee"
 dependencies = [
- "serde",
+ "serde_core",
 ]

 [[package]]
@@ -7336,15 +7455,15 @@ dependencies = [

 [[package]]
 name = "serde_with"
-version = "3.14.0"
+version = "3.14.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2c45cd61fefa9db6f254525d46e392b852e0e61d9a1fd36e5bd183450a556d5"
+checksum = "c522100790450cf78eeac1507263d0a350d4d5b30df0c8e1fe051a10c22b376e"
 dependencies = [
 "base64 0.22.1",
 "chrono",
 "hex",
 "indexmap 1.9.3",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "schemars 0.9.0",
 "schemars 1.0.4",
 "serde",
@@ -7356,11 +7475,11 @@ dependencies = [

 [[package]]
 name = "serde_with_macros"
-version = "3.14.0"
+version = "3.14.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de90945e6565ce0d9a25098082ed4ee4002e047cb59892c318d66821e14bb30f"
+checksum = "327ada00f7d64abaac1e55a6911e90cf665aa051b9a561c7006c157f4633135e"
 dependencies = [
- "darling 0.20.11",
+ "darling 0.21.3",
 "proc-macro2",
 "quote",
 "syn 2.0.106",
@@ -7372,7 +7491,7 @@ version = "0.9.34+deprecated"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
 dependencies = [
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "itoa",
 "ryu",
 "serde",
@@ -7510,9 +7629,9 @@ dependencies = [

 [[package]]
 name = "simba"
-version = "0.9.0"
+version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3a386a501cd104797982c15ae17aafe8b9261315b5d07e3ec803f2ea26be0fa"
+checksum = "c99284beb21666094ba2b75bbceda012e610f5479dfcc2d6e2426f53197ffd95"
 dependencies = [
 "approx",
 "num-complex",
@@ -7661,9 +7780,9 @@ dependencies = [

 [[package]]
 name = "stop-words"
-version = "0.8.1"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c6a86be9f7fa4559b7339669e72026eb437f5e9c5a85c207fe1033079033a17"
+checksum = "645a3d441ccf4bf47f2e4b7681461986681a6eeea9937d4c3bc9febd61d17c71"
 dependencies = [
 "serde_json",
 ]
@@ -7927,7 +8046,7 @@ version = "0.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ec7dddc5f0fee506baf8b9fdb989e242f17e4b11c61dfbb0635b705217199eea"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "byteorder",
 "enum-as-inner",
 "libc",
@@ -7941,7 +8060,7 @@ version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "01198a2debb237c62b6826ec7081082d951f46dbb64b0e8c7649a452230d1dfc"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "byteorder",
 "enum-as-inner",
 "libc",
@@ -7981,7 +8100,7 @@ version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "core-foundation 0.9.4",
 "system-configuration-sys 0.6.0",
 ]
@@ -8037,15 +8156,15 @@ dependencies = [

 [[package]]
 name = "tempfile"
-version = "3.21.0"
+version = "3.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
+checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
 dependencies = [
 "fastrand",
 "getrandom 0.3.3",
 "once_cell",
- "rustix 1.0.8",
- "windows-sys 0.60.2",
+ "rustix",
+ "windows-sys 0.61.0",
 ]

 [[package]]
@@ -8065,7 +8184,7 @@ version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60b8cb979cb11c32ce1603f8137b22262a9d131aaa5c37b5678025f22b8becd0"
 dependencies = [
- "rustix 1.0.8",
+ "rustix",
 "windows-sys 0.60.2",
 ]

@@ -8129,20 +8248,23 @@ dependencies = [

 [[package]]
 name = "tiff"
-version = "0.9.1"
+version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba1310fcea54c6a9a4fd1aad794ecc02c31682f6bfbecdf460bf19533eed1e3e"
+checksum = "af9605de7fee8d9551863fd692cce7637f548dbd9db9180fcc07ccc6d26c336f"
 dependencies = [
+ "fax",
 "flate2",
- "jpeg-decoder",
+ "half 2.6.0",
+ "quick-error 2.0.1",
 "weezl",
+ "zune-jpeg",
 ]

 [[package]]
 name = "time"
-version = "0.3.41"
+version = "0.3.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40"
+checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d"
 dependencies = [
 "deranged",
 "itoa",
@@ -8157,15 +8279,15 @@ dependencies = [

 [[package]]
 name = "time-core"
-version = "0.1.4"
+version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c"
+checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b"

 [[package]]
 name = "time-macros"
-version = "0.2.22"
+version = "0.2.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49"
+checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3"
 dependencies = [
 "num-conv",
 "time-core",
@@ -8317,9 +8439,9 @@ dependencies = [

 [[package]]
 name = "tokio-rustls"
-version = "0.26.2"
+version = "0.26.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b"
+checksum = "05f63835928ca123f1bef57abbcd23bb2ba0ac9ae1235f1e65bda0d06e7786bd"
 dependencies = [
 "rustls",
 "tokio",
@@ -8470,18 +8592,18 @@ dependencies = [
 "serde",
 "serde_spanned 0.6.9",
 "toml_datetime 0.6.11",
- "toml_edit",
+ "toml_edit 0.22.27",
 ]

 [[package]]
 name = "toml"
-version = "0.9.5"
+version = "0.9.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75129e1dc5000bfbaa9fee9d1b21f974f9fbad9daec557a521ee6e080825f6e8"
+checksum = "00e5e5d9bf2475ac9d4f0d9edab68cc573dc2fd644b0dba36b0c30a92dd9eaa0"
 dependencies = [
- "serde",
- "serde_spanned 1.0.0",
- "toml_datetime 0.7.0",
+ "serde_core",
+ "serde_spanned 1.0.2",
+ "toml_datetime 0.7.2",
 "toml_parser",
 "winnow",
 ]
@@ -8497,11 +8619,11 @@ dependencies = [

 [[package]]
 name = "toml_datetime"
-version = "0.7.0"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bade1c3e902f58d73d3f294cd7f20391c1cb2fbcb643b73566bc773971df91e3"
+checksum = "32f1085dec27c2b6632b04c80b3bb1b4300d6495d1e129693bdda7d91e72eec1"
 dependencies = [
- "serde",
+ "serde_core",
 ]

 [[package]]
@@ -8510,7 +8632,7 @@ version = "0.22.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
 dependencies = [
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "serde",
 "serde_spanned 0.6.9",
 "toml_datetime 0.6.11",
@@ -8518,6 +8640,18 @@ dependencies = [
 "winnow",
 ]

+[[package]]
+name = "toml_edit"
+version = "0.23.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3effe7c0e86fdff4f69cdd2ccc1b96f933e24811c5441d44904e8683e27184b"
+dependencies = [
+ "indexmap 2.11.4",
+ "toml_datetime 0.7.2",
+ "toml_parser",
+ "winnow",
+]
+
 [[package]]
 name = "toml_parser"
 version = "1.0.3"
@@ -8635,7 +8769,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
 dependencies = [
 "futures-core",
 "futures-util",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "pin-project-lite",
 "slab",
 "sync_wrapper 1.0.2",
@@ -8652,7 +8786,7 @@ version = "0.6.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2"
 dependencies = [
- "bitflags 2.9.3",
+ "bitflags 2.9.4",
 "bytes",
 "futures-util",
 "http 1.3.1",
@@ -8968,15 +9102,15 @@ checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539"

 [[package]]
 name = "unicode-general-category"
-version = "1.0.0"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24adfe8311434967077a6adff125729161e6e4934d76f6b7c55318ac5c9246d3"
+checksum = "0b993bddc193ae5bd0d623b49ec06ac3e9312875fdae725a975c51db1cc1677f"

 [[package]]
 name = "unicode-ident"
-version = "1.0.18"
+version = "1.0.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d"

 [[package]]
 name = "unicode-normalization-alignments"
@@ -9112,7 +9246,7 @@ version = "5.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2fcc29c80c21c31608227e0912b2d7fddba57ad76b606890627ba8ee7964e993"
 dependencies = [
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "serde",
 "serde_json",
 "utoipa-gen",
@@ -9140,9 +9274,9 @@ dependencies = [

 [[package]]
 name = "uuid"
-version = "1.18.0"
+version = "1.18.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f33196643e165781c20a5ead5582283a7dacbb87855d867fbc2df3f81eddc1be"
+checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2"
 dependencies = [
 "getrandom 0.3.3",
 "js-sys",
@@ -9312,30 +9446,40 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"

 [[package]]
 name = "wasi"
-version = "0.14.3+wasi-0.2.4"
+version = "0.14.7+wasi-0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95"
+checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c"
+dependencies = [
+ "wasip2",
+]
+
+[[package]]
+name = "wasip2"
+version = "1.0.1+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7"
 dependencies = [
 "wit-bindgen",
 ]

 [[package]]
 name = "wasm-bindgen"
-version = "0.2.100"
+version = "0.2.104"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5"
+checksum = "c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d"
 dependencies = [
 "cfg-if 1.0.3",
 "once_cell",
 "rustversion",
 "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
 ]

 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.100"
+version = "0.2.104"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6"
+checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19"
 dependencies = [
 "bumpalo",
 "log",
@@ -9347,9 +9491,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.50"
+version = "0.4.54"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61"
+checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c"
 dependencies = [
 "cfg-if 1.0.3",
 "js-sys",
@@ -9360,9 +9504,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.100"
+version = "0.2.104"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407"
+checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119"
 dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -9370,9 +9514,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.100"
+version = "0.2.104"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
+checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -9383,9 +9527,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.100"
+version = "0.2.104"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d"
+checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1"
 dependencies = [
 "unicode-ident",
 ]
@@ -9405,9 +9549,9 @@ dependencies = [

 [[package]]
 name = "web-sys"
-version = "0.3.77"
+version = "0.3.81"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2"
+checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120"
 dependencies = [
 "js-sys",
 "wasm-bindgen",
@@ -9459,18 +9603,6 @@ version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3"

-[[package]]
-name = "which"
-version = "4.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
-dependencies = [
- "either",
- "home",
- "once_cell",
- "rustix 0.38.44",
-]
-
 [[package]]
 name = "wide"
 version = "0.7.33"
@@ -9517,11 +9649,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"

 [[package]]
 name = "winapi-util"
-version = "0.1.10"
+version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22"
+checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.60.2",
+ "windows-sys 0.61.0",
 ]

 [[package]]
@@ -9551,15 +9683,15 @@ dependencies = [

 [[package]]
 name = "windows-core"
-version = "0.61.2"
+version = "0.62.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3"
+checksum = "57fe7168f7de578d2d8a05b07fd61870d2e73b4020e9f49aa00da8471723497c"
 dependencies = [
 "windows-implement",
 "windows-interface",
- "windows-link",
- "windows-result",
- "windows-strings",
+ "windows-link 0.2.0",
+ "windows-result 0.4.0",
+ "windows-strings 0.5.0",
 ]

 [[package]]
@@ -9590,15 +9722,21 @@ version = "0.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"

+[[package]]
+name = "windows-link"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65"
+
 [[package]]
 name = "windows-registry"
 version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e"
 dependencies = [
- "windows-link",
- "windows-result",
- "windows-strings",
+ "windows-link 0.1.3",
+ "windows-result 0.3.4",
+ "windows-strings 0.4.2",
 ]

 [[package]]
@@ -9607,7 +9745,16 @@ version = "0.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6"
 dependencies = [
- "windows-link",
+ "windows-link 0.1.3",
+]
+
+[[package]]
+name = "windows-result"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7084dcc306f89883455a206237404d3eaf961e5bd7e0f312f7c91f57eb44167f"
+dependencies = [
+ "windows-link 0.2.0",
 ]

 [[package]]
@@ -9616,7 +9763,16 @@ version = "0.4.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57"
 dependencies = [
- "windows-link",
+ "windows-link 0.1.3",
+]
+
+[[package]]
+name = "windows-strings"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7218c655a553b0bed4426cf54b20d7ba363ef543b52d515b3e48d7fd55318dda"
+dependencies = [
+ "windows-link 0.2.0",
 ]

 [[package]]
@@ -9655,6 +9811,15 @@ dependencies = [
 "windows-targets 0.53.3",
 ]

+[[package]]
+name = "windows-sys"
+version = "0.61.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa"
+dependencies = [
+ "windows-link 0.2.0",
+]
+
 [[package]]
 name = "windows-targets"
 version = "0.48.5"
@@ -9692,7 +9857,7 @@ version = "0.53.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91"
 dependencies = [
- "windows-link",
+ "windows-link 0.1.3",
 "windows_aarch64_gnullvm 0.53.0",
 "windows_aarch64_msvc 0.53.0",
 "windows_i686_gnu 0.53.0",
@@ -9862,9 +10027,9 @@ dependencies = [

 [[package]]
 name = "wit-bindgen"
-version = "0.45.0"
+version = "0.46.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814"
+checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"

 [[package]]
 name = "writeable"
@@ -9955,18 +10120,18 @@ dependencies = [

 [[package]]
 name = "zerocopy"
-version = "0.8.26"
+version = "0.8.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f"
+checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c"
 dependencies = [
 "zerocopy-derive",
 ]

 [[package]]
 name = "zerocopy-derive"
-version = "0.8.26"
+version = "0.8.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181"
+checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -10024,7 +10189,7 @@ dependencies = [
 "thiserror 1.0.69",
 "tokio",
 "tokio-util",
- "uuid 1.18.0",
+ "uuid 1.18.1",
 ]

 [[package]]
@@ -10080,7 +10245,7 @@ dependencies = [
 "crc32fast",
 "crossbeam-utils",
 "displaydoc",
- "indexmap 2.11.0",
+ "indexmap 2.11.4",
 "num_enum",
 "thiserror 1.0.69",
 ]
@@ -10124,9 +10289,9 @@ dependencies = [

 [[package]]
 name = "zune-jpeg"
-version = "0.4.20"
+version = "0.4.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc1f7e205ce79eb2da3cd71c5f55f3589785cb7c79f6a03d1c8d1491bda5d089"
+checksum = "29ce2c8a9384ad323cf564b67da86e21d3cfdff87908bc1223ed5c99bc792713"
 dependencies = [
 "zune-core",
 ]
--- a/lib/runtime/Cargo.toml
+++ b/lib/runtime/Cargo.toml
@@ -17,6 +17,7 @@ default = []
 integration = []
 testing-etcd = [] # Tests that require an active ETCD server
 tokio-console = ["dep:console-subscriber", "tokio/tracing"]
+compute-validation = [] # Enable validation and timing for compute macros

 [dependencies]
 # Use workspace dependencies where available
@@ -63,11 +64,14 @@ nid = { version = "3.0.0", features = ["serde"] }
 nix = { version = "0.29", features = ["signal"] }
 nuid = { version = "0.5" }
 once_cell = { version = "1" }
+rayon = { version = "1.10" }
 regex = { version = "1" }
 socket2 = { version = "0.5.8" }
+tokio-rayon = { version = "2.1" }

 [dev-dependencies]
 assert_matches = { version = "1.5.0" }
+criterion = { version = "0.5", features = ["async_tokio"] }
 env_logger = { version = "0.11" }
 reqwest = { workspace = true }
 rstest = { version = "0.23.0" }
@@ -75,3 +79,7 @@ temp-env = { version = "0.3.6" , features=["async_closure"] }
 stdio-override = {version= "0.2.0"}
 jsonschema = {version = "0.17"}
 tempfile = { workspace = true }
+
+[[bench]]
+name = "compute_pool_overhead"
+harness = false
\ No newline at end of file
--- a/lib/runtime/benches/compute_pool_overhead.rs
+++ b/lib/runtime/benches/compute_pool_overhead.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use criterion::{BenchmarkId, Criterion, black_box, criterion_group, criterion_main};
+use dynamo_runtime::compute::ComputePool;
+use std::sync::Arc;
+
+/// Compute-intensive function: sum of all primes up to n
+fn compute_primes_sum(n: u64) -> u64 {
+    let mut sum = 0u64;
+    for candidate in 2..=n {
+        if is_prime(candidate) {
+            sum += candidate;
+        }
+    }
+    sum
+}
+
+fn is_prime(n: u64) -> bool {
+    if n <= 1 {
+        return false;
+    }
+    if n <= 3 {
+        return true;
+    }
+    if n.is_multiple_of(2) || n.is_multiple_of(3) {
+        return false;
+    }
+
+    let sqrt_n = (n as f64).sqrt() as u64;
+    for i in (5..=sqrt_n).step_by(6) {
+        if n.is_multiple_of(i) || n.is_multiple_of(i + 2) {
+            return false;
+        }
+    }
+    true
+}
+
+fn bench_compute_overhead(c: &mut Criterion) {
+    // Test 3 representative sizes: small, medium, large
+    let test_sizes = [10, 1_000, 100_000];
+
+    let mut group = c.benchmark_group("compute_overhead");
+    group.sample_size(10); // Reduce sample size for longer benchmarks
+
+    // Setup runtimes
+    let tokio_4thread = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(4)
+        .max_blocking_threads(1)
+        .enable_all()
+        .build()
+        .unwrap();
+    let tokio_1thread = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(1)
+        .max_blocking_threads(1)
+        .enable_all()
+        .build()
+        .unwrap();
+
+    // Setup compute pool
+    let compute_config = dynamo_runtime::compute::ComputeConfig {
+        num_threads: Some(4),
+        stack_size: Some(2 * 1024 * 1024),
+        thread_prefix: "bench".to_string(),
+        pin_threads: false,
+    };
+    let compute_pool = Arc::new(ComputePool::new(compute_config).unwrap());
+
+    for n in test_sizes {
+        // Benchmark 1: Direct execution on Tokio (4 threads)
+        group.bench_with_input(BenchmarkId::new("tokio_direct", n), &n, |b, &n| {
+            b.to_async(&tokio_4thread)
+                .iter(|| async move { black_box(compute_primes_sum(black_box(n))) });
+        });
+
+        // Benchmark 2: Rayon offload (1 Tokio thread + 4 Rayon threads)
+        let pool = compute_pool.clone();
+        group.bench_with_input(BenchmarkId::new("rayon_offload", n), &n, |b, &n| {
+            b.to_async(&tokio_1thread).iter(|| {
+                let pool = pool.clone();
+                async move {
+                    pool.execute(move || black_box(compute_primes_sum(black_box(n))))
+                        .await
+                        .unwrap()
+                }
+            });
+        });
+
+        // Benchmark 3: spawn_blocking (4 Tokio threads)
+        group.bench_with_input(BenchmarkId::new("spawn_blocking", n), &n, |b, &n| {
+            b.to_async(&tokio_4thread).iter(|| async move {
+                tokio::task::spawn_blocking(move || black_box(compute_primes_sum(black_box(n))))
+                    .await
+                    .unwrap()
+            });
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_parallel_tasks(c: &mut Criterion) {
+    let mut group = c.benchmark_group("parallel_tasks");
+    group.sample_size(10); // Criterion's minimum sample size; each iteration runs a whole batch
+
+    let tokio_runtime = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(4)
+        .max_blocking_threads(1)
+        .enable_all()
+        .build()
+        .unwrap();
+    let compute_config = dynamo_runtime::compute::ComputeConfig {
+        num_threads: Some(4),
+        stack_size: Some(2 * 1024 * 1024),
+        thread_prefix: "bench".to_string(),
+        pin_threads: false,
+    };
+    let compute_pool = Arc::new(ComputePool::new(compute_config).unwrap());
+
+    // Test with different batch sizes
+    for batch_size in [10, 100] {
+        let n = 10_000; // Fixed compute size
+
+        // Direct parallel execution on Tokio threads
+        group.bench_with_input(
+            BenchmarkId::new("tokio_direct_parallel", batch_size),
+            &batch_size,
+            |b, &batch_size| {
+                b.to_async(&tokio_runtime).iter(|| async move {
+                    let tasks = (0..batch_size)
+                        .map(|_| tokio::spawn(async move { compute_primes_sum(n) }))
+                        .collect::<Vec<_>>();
+
+                    for task in tasks {
+                        black_box(task.await.unwrap());
+                    }
+                });
+            },
+        );
+
+        // Parallel execution with Rayon
+        let pool = compute_pool.clone();
+        group.bench_with_input(
+            BenchmarkId::new("rayon_parallel", batch_size),
+            &batch_size,
+            |b, &batch_size| {
+                b.to_async(&tokio_runtime).iter(|| {
+                    let pool = pool.clone();
+                    async move {
+                        let tasks = (0..batch_size)
+                            .map(|_| {
+                                let pool = pool.clone();
+                                tokio::spawn(async move {
+                                    pool.execute(move || compute_primes_sum(n)).await.unwrap()
+                                })
+                            })
+                            .collect::<Vec<_>>();
+
+                        for task in tasks {
+                            black_box(task.await.unwrap());
+                        }
+                    }
+                });
+            },
+        );
+
+        // Parallel execution with spawn_blocking
+        group.bench_with_input(
+            BenchmarkId::new("spawn_blocking_parallel", batch_size),
+            &batch_size,
+            |b, &batch_size| {
+                b.to_async(&tokio_runtime).iter(|| async move {
+                    let tasks = (0..batch_size)
+                        .map(|_| {
+                            tokio::spawn(async move {
+                                tokio::task::spawn_blocking(move || compute_primes_sum(n))
+                                    .await
+                                    .unwrap()
+                            })
+                        })
+                        .collect::<Vec<_>>();
+
+                    for task in tasks {
+                        black_box(task.await.unwrap());
+                    }
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+fn bench_block_in_place_overhead(c: &mut Criterion) {
+    // Measure block_in_place overhead across small, medium, and large tasks
+    let test_sizes = [10, 1_000, 100_000];
+
+    let mut group = c.benchmark_group("block_in_place_overhead");
+    group.sample_size(10);
+
+    // Setup 4-thread runtime for testing
+    let tokio_runtime = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(4)
+        .max_blocking_threads(1)
+        .enable_all()
+        .build()
+        .unwrap();
+
+    // Setup compute pool for comparison
+    let compute_config = dynamo_runtime::compute::ComputeConfig {
+        num_threads: Some(4),
+        stack_size: Some(2 * 1024 * 1024),
+        thread_prefix: "bench".to_string(),
+        pin_threads: false,
+    };
+    let compute_pool = Arc::new(ComputePool::new(compute_config).unwrap());
+
+    for n in test_sizes {
+        // Benchmark 1: Direct execution (baseline)
+        group.bench_with_input(BenchmarkId::new("direct", n), &n, |b, &n| {
+            b.to_async(&tokio_runtime)
+                .iter(|| async move { black_box(compute_primes_sum(black_box(n))) });
+        });
+
+        // Benchmark 2: block_in_place (no semaphore)
+        group.bench_with_input(BenchmarkId::new("block_in_place", n), &n, |b, &n| {
+            b.to_async(&tokio_runtime).iter(|| async move {
+                tokio::task::block_in_place(|| black_box(compute_primes_sum(black_box(n))))
+            });
+        });
+
+        // Benchmark 3: spawn_blocking
+        group.bench_with_input(BenchmarkId::new("spawn_blocking", n), &n, |b, &n| {
+            b.to_async(&tokio_runtime).iter(|| async move {
+                tokio::task::spawn_blocking(move || black_box(compute_primes_sum(black_box(n))))
+                    .await
+                    .unwrap()
+            });
+        });
+
+        // Benchmark 4: Rayon offload
+        let pool = compute_pool.clone();
+        group.bench_with_input(BenchmarkId::new("rayon_offload", n), &n, |b, &n| {
+            b.to_async(&tokio_runtime).iter(|| {
+                let pool = pool.clone();
+                async move {
+                    pool.execute(move || black_box(compute_primes_sum(black_box(n))))
+                        .await
+                        .unwrap()
+                }
+            });
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_compute_overhead,
+    bench_parallel_tasks,
+    bench_block_in_place_overhead
+);
+criterion_main!(benches);
--- a/lib/runtime/docs/rayon-tokio-strategy.md
+++ b/lib/runtime/docs/rayon-tokio-strategy.md
+# Rayon-Tokio Integration Strategy
+
+## Overview
+
+This document describes the integration strategy for combining Tokio's asynchronous runtime with Rayon's data-parallel compute capabilities in the Dynamo runtime. The core philosophy is simple:
+
+- **Tokio** handles I/O-bound operations, waiting, and coordination
+- **Rayon** handles CPU-bound operations and data parallelism
+- Multiple async tasks can concurrently submit different types of work to the shared Rayon thread pool
+
+## Architecture
+
+```text
++--------------------------------------------------------------+
+|                     Tokio Runtime                            |
+|  +-----------+    +-----------+    +-----------+             |
+|  | Async     |    | Async     |    | Async     |             |
+|  | Task 1    |    | Task 2    |    | Task 3    |             |
+|  |           |    |           |    |           |             |
+|  | Receives  |    | Processes |    | Handles   |             |
+|  | requests  |    | streams   |    | batches   |             |
+|  +-----+-----+    +-----+-----+    +-----+-----+             |
+|        |                |                |                   |
+|        +----------------+----------------+                   |
+|                         |                                    |
+|                  tokio_rayon::spawn                          |
+|                         |                                    |
++-------------------------+------------------------------------+
+                          |
+                          v
++--------------------------------------------------------------+
+|                    Rayon Thread Pool                         |
+|                                                              |
+|  +--------------------------------------------------------+  |
+|  |         Work-Stealing Thread Pool (N threads)          |  |
+|  |                                                        |  |
+|  |  +---------+  +-----------+  +------------------+      |  |
+|  |  | scope() |  | par_iter()|  | join()           |      |  |
+|  |  | tasks   |  | chunks    |  | computations     |      |  |
+|  |  +---------+  +-----------+  +------------------+      |  |
+|  |                                                        |  |
+|  |  All patterns share the same thread pool               |  |
+|  +--------------------------------------------------------+  |
++--------------------------------------------------------------+
+```
+
+## When to Use Tokio vs Rayon
+
+### Use Tokio (async/await) when
+- **Waiting for I/O**: Network requests, file I/O, database queries
+- **Coordinating tasks**: Channels, synchronization, signaling
+- **Stream processing**: Items arrive over time with delays
+- **Resource pooling**: Connection pools, async locks
+- **Service orchestration**: Managing component lifecycles
+
+### Use Rayon (compute pool) when
+- **Batch processing**: You have all data ready for parallel processing
+- **CPU-intensive work**: Computation takes >1ms per item
+- **Data transformation**: Tokenization, serialization, compression
+- **Parallel algorithms**: Matrix operations, sorting, searching
+- **Map-reduce patterns**: Aggregations over large datasets
+
+### Decision Thresholds
+- Use Rayon when processing **≥10 items** in parallel
+- Use `spawn_blocking` when CPU work takes **>1ms**
+- Keep Tokio for operations with **>100μs waits** between items
+- Use Rayon when you can **saturate multiple CPU cores**
+
+### Overhead Considerations
+Based on benchmarks, the async bridge between Tokio and Rayon has:
+- **~25μs overhead** for small tasks (due to channel communication)
+- **~4% overhead** for tasks taking >2ms
+- **Negligible overhead** for tasks taking >10ms
+
+For minimal overhead when using Rayon from an async context:
+- **Small tasks (<100μs)**: Run directly on Tokio
+- **Medium tasks (100μs-1ms)**: Use `spawn_blocking` + `pool.execute_sync()`
+- **Large tasks (>1ms)**: Use `pool.execute()` for convenience
+
+## Concurrent Usage Patterns
+
+The key insight is that multiple async tasks can concurrently use the same Rayon thread pool with different parallelization patterns. Rayon's work-stealing scheduler efficiently distributes work regardless of the pattern used.
+
+### Pattern 1: Concurrent Scope and ParIter
+
+```rust,ignore
+use std::sync::Arc;
+use dynamo_runtime::compute::ComputePool;
+
+async fn concurrent_compute_tasks(pool: Arc<ComputePool>) {
+    // Task 1: Using scope for dynamic task spawning
+    let task1 = tokio::spawn({
+        let pool = pool.clone();
+        async move {
+            pool.execute_scoped(|scope| {
+                // Dynamically spawn tasks based on runtime conditions
+                for i in 0..num_tasks {
+                    scope.spawn(move |_| {
+                        expensive_computation(i)
+                    });
+                }
+            }).await
+        }
+    });
+
+    // Task 2: Using parallel iterators for batch processing
+    let task2 = tokio::spawn({
+        let pool = pool.clone();
+        async move {
+            pool.install(|| {
+                // Process data in parallel chunks
+                data.par_chunks(100)
+                    .map(|chunk| transform_chunk(chunk))
+                    .collect::<Vec<_>>()
+            }).await
+        }
+    });
+
+    // Task 3: Using join for binary parallelism
+    let task3 = tokio::spawn({
+        let pool = pool.clone();
+        async move {
+            pool.join(
+                || compute_left_branch(),
+                || compute_right_branch(),
+            ).await
+        }
+    });
+
+    // All three tasks run concurrently, sharing the Rayon thread pool
+    let (r1, r2, r3) = tokio::join!(task1, task2, task3);
+}
+```
+
+### Pattern 2: Stream Processing with Batch Compute
+
+```rust,no_run
+# use futures::StreamExt;
+# use rayon::prelude::*;
+# use std::sync::Arc;
+# struct Data;
+# fn process_item(_: &Data) -> i32 { 0 }
+# async fn send_results(_: Vec<i32>) {}
+# use dynamo_runtime::compute::ComputePool;
+# use futures::stream::Stream;
+
+/// Example: Process async stream with CPU-intensive batch operations
+async fn stream_with_compute(
+    pool: Arc<ComputePool>,
+    stream: impl Stream<Item = Vec<Data>>,
+) {
+    // Use for_each_concurrent for proper stream consumption
+    stream.for_each_concurrent(4, |batch| {
+        let pool = pool.clone();
+        async move {
+            // Process batch using parallel iterators
+            let result = pool.install(move || {
+                batch.par_iter()
+                    .map(|item| process_item(item))
+                    .collect::<Vec<_>>()
+            }).await.unwrap();
+
+            // Async I/O to send results
+            send_results(result).await;
+        }
+    }).await;
+}
+```
+
+### Pattern 3: Mixed Workload Service
+
+```rust,ignore
+/// Real-world example: LLM service with mixed workloads
+struct LLMService {
+    runtime: Arc<Runtime>,
+    tokenizer: Arc<Tokenizer>,
+}
+
+impl LLMService {
+    async fn run(&self) {
+        let pool = self.runtime.compute_pool()
+            .expect("Compute pool required");
+
+        // Tokenization service - uses parallel iterators
+        let tokenization_task = {
+            let pool = pool.clone();
+            let tokenizer = self.tokenizer.clone();
+            tokio::spawn(async move {
+                loop {
+                    // Async I/O: receive batch from network
+                    let texts = receive_tokenization_batch().await;
+
+                    // CPU-bound: parallel tokenization
+                    let tokens = pool.install(move || {
+                        texts.par_iter()
+                            .map(|text| tokenizer.encode(text))
+                            .collect::<Vec<_>>()
+                    }).await.unwrap();
+
+                    // Async I/O: send results
+                    send_tokens(tokens).await;
+                }
+            })
+        };
+
+        // Embedding service - uses scope for multi-stage computation
+        let embedding_task = {
+            let pool = pool.clone();
+            tokio::spawn(async move {
+                loop {
+                    // Async I/O: receive request
+                    let request = receive_embedding_request().await;
+
+                    // CPU-bound: multi-stage parallel computation
+                    let embeddings = pool.execute_scoped(|scope| {
+                        let mut text_emb = None;
+                        let mut context_emb = None;
+
+                        scope.spawn(|_| {
+                            text_emb = Some(compute_text_embedding(&request.text));
+                        });
+
+                        scope.spawn(|_| {
+                            context_emb = Some(compute_context_embedding(&request.context));
+                        });
+
+                        // Scope waits for both to complete
+                        combine_embeddings(text_emb.unwrap(), context_emb.unwrap())
+                    }).await.unwrap();
+
+                    // Async I/O: send results
+                    send_embeddings(embeddings).await;
+                }
+            })
+        };
+
+        // Batch inference service - uses nested parallelism
+        let inference_task = {
+            let pool = pool.clone();
+            tokio::spawn(async move {
+                loop {
+                    let batch = receive_inference_batch().await;
+
+                    let results = pool.execute_scoped(|scope| {
+                        let mut results = Vec::with_capacity(batch.len());
+
+                        // Spawn a task for each item
+                        for item in batch {
+                            scope.spawn(move |s2| {
+                                // Within each task, use parallel iterators
+                                let preprocessed = item.data
+                                    .par_chunks(10)
+                                    .map(|chunk| preprocess(chunk))
+                                    .collect::<Vec<_>>();
+
+                                // Can spawn more tasks within nested scope
+                                let mut stages = vec![];
+                                for p in preprocessed {
+                                    s2.spawn(move |_| {
+                                        stages.push(run_inference(p));
+                                    });
+                                }
+
+                                results.push(merge_stages(stages));
+                            });
+                        }
+
+                        results
+                    }).await.unwrap();
+
+                    send_inference_results(results).await;
+                }
+            })
+        };
+
+        // All services run concurrently, sharing the compute pool
+        tokio::join!(tokenization_task, embedding_task, inference_task);
+    }
+}
+```
+
+## How It Works: Thread Pool Sharing
+
+Rayon's work-stealing scheduler ensures efficient resource utilization even when different async tasks submit different types of work:
+
+1. **Work Queues**: Each Rayon thread has a local deque (double-ended queue)
+2. **Local Execution**: Threads prefer executing their own tasks (LIFO for cache locality)
+3. **Work Stealing**: Idle threads steal tasks from busy threads (FIFO from the other end)
+4. **No Interference**: Different parallelization patterns (scope, par_iter) coexist peacefully
+
+This means:
+- A `scope` task spawning many small tasks works alongside `par_chunks` processing large batches
+- The thread pool automatically balances load between different types of work
+- No manual coordination needed between different async tasks using the pool
+
+## Performance Considerations
+
+### Thread Pool Sizing
+
+```toml
+[runtime]
+# Tokio threads: optimize for concurrent async tasks
+num_worker_threads = 8  # Usually number of cores
+
+# Rayon threads: optimize for CPU saturation
+compute_threads = 4     # Often cores/2 to avoid oversubscription
+```
+
+### Avoiding Oversubscription
+
+Total threads = Tokio workers + Rayon threads + System threads
+
+**Recommendation**: Keep total ≤ 1.5 × physical cores
+
+### Monitoring Pool Utilization
+
+```rust,ignore
+// Check pool metrics
+let metrics = pool.metrics();
+println!("Active tasks: {}", metrics.tasks_active());
+println!("Average duration: {:.2}ms", metrics.avg_task_duration_us() / 1000.0);
+println!("Slow tasks (>100ms): {}", metrics.slow_tasks());
+
+// Adjust pool size if consistently over/under utilized
+if metrics.tasks_active() > pool.num_threads() * 2 {
+    // Consider increasing compute_threads
+}
+```
+
+## Common Patterns and Best Practices
+
+### DO: Batch Collection Before Processing
+
+```rust,ignore
+// ✅ Good: Collect async items, then process in parallel
+let items = stream.take(100).collect::<Vec<_>>().await;
+let processed = pool.install(|| {
+    items.par_iter().map(|item| process(item)).collect()
+}).await?;
+```
+
+### DON'T: Mix Async and Compute in Tight Loops
+
+```rust,ignore
+// ❌ Bad: Alternating between async and compute
+for item in items {
+    let data = fetch_data(item).await;  // Async
+    let result = pool.execute(|| compute(data)).await?;  // Compute
+    store_result(result).await;  // Async
+}
+
+// ✅ Good: Batch operations
+let all_data = futures::future::join_all(
+    items.iter().map(|item| fetch_data(item))
+).await;
+
+let all_results = pool.install(|| {
+    all_data.par_iter().map(|data| compute(data)).collect()
+}).await?;
+
+futures::future::join_all(
+    all_results.iter().map(|result| store_result(result))
+).await;
+```
+
+### DO: Use Scope for Dynamic Parallelism
+
+```rust,ignore
+// ✅ Good: When you don't know the parallelism level upfront
+pool.execute_scoped(|scope| {
+    while let Some(work) = find_more_work() {
+        scope.spawn(move |_| {
+            process_work(work);
+        });
+    }
+}).await?;
+```
+
+### DO: Use ParIter for Data Parallelism
+
+```rust,ignore
+// ✅ Good: When processing collections
+pool.install(|| {
+    data.par_chunks(optimal_chunk_size())
+        .map(|chunk| process_chunk(chunk))
+        .reduce(|| initial_value(), |a, b| combine(a, b))
+}).await?;
+```
+
+## Troubleshooting
+
+### Issue: High Latency Despite Low CPU Usage
+- **Cause**: Too few Rayon threads for the workload
+- **Solution**: Increase the `compute_threads` setting
+
+### Issue: System Feels Sluggish
+- **Cause**: Thread oversubscription
+- **Solution**: Reduce total thread count (Tokio + Rayon)
+
+### Issue: Uneven Work Distribution
+- **Cause**: Poor chunk size selection
+- **Solution**: Use smaller chunks or dynamic scheduling with `scope`
+
+### Issue: Deadlock or Hanging
+- **Cause**: Nested `install()` calls or blocking in Rayon threads
+- **Solution**: Use `execute()` instead of `install()` for simple tasks
+
+## Configuration Examples
+
+### High-Throughput Service
+```toml
+# Many concurrent requests, moderate compute per request
+[runtime]
+num_worker_threads = 16
+compute_threads = 8
+compute_stack_size = "4MB"
+```
+
+### Batch Processing System
+```toml
+# Few concurrent tasks, heavy compute per batch
+[runtime]
+num_worker_threads = 4
+compute_threads = 12
+compute_stack_size = "8MB"
+```
+
+### Mixed Workload
+```toml
+# Balance between async I/O and compute
+[runtime]
+num_worker_threads = 8
+compute_threads = 6
+compute_stack_size = "2MB"
+```
+
+## Summary
+
+The Rayon-Tokio integration provides a powerful model for handling mixed workloads:
+
+1. **Tokio** manages async I/O and coordination
+2. **Rayon** provides a shared compute thread pool
+3. Multiple async tasks can concurrently use different Rayon patterns
+4. Work-stealing ensures efficient resource utilization
+5. Clear separation between I/O-bound and CPU-bound work
+
+This architecture enables building high-performance services that efficiently handle both network I/O and CPU-intensive computations without manual thread management or complex synchronization.
\ No newline at end of file
--- a/lib/runtime/examples/async_throughput_demo.rs
+++ b/lib/runtime/examples/async_throughput_demo.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use anyhow::Result;
+use dynamo_runtime::compute::{ComputeConfig, ComputePool};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::{Duration, Instant};
+use tokio::time::sleep;
+
+/// Compute-intensive function: sum of all primes up to n
+fn compute_primes_sum(n: u64) -> u64 {
+    let mut sum = 0u64;
+    for candidate in 2..=n {
+        if is_prime(candidate) {
+            sum += candidate;
+        }
+    }
+    sum
+}
+
+fn is_prime(n: u64) -> bool {
+    if n <= 1 {
+        return false;
+    }
+    if n <= 3 {
+        return true;
+    }
+    if n.is_multiple_of(2) || n.is_multiple_of(3) {
+        return false;
+    }
+
+    let sqrt_n = (n as f64).sqrt() as u64;
+    for i in (5..=sqrt_n).step_by(6) {
+        if n.is_multiple_of(i) || n.is_multiple_of(i + 2) {
+            return false;
+        }
+    }
+    true
+}
+
+/// Simulated async task that does both I/O and compute
+async fn async_task_inline(id: usize, n: u64, io_delay: Duration) -> (usize, Duration) {
+    let start = Instant::now();
+
+    // Simulate async I/O operation
+    sleep(io_delay).await;
+
+    // CPU-intensive work (blocks the async runtime!)
+    let _result = compute_primes_sum(n);
+
+    // More async I/O
+    sleep(io_delay).await;
+
+    (id, start.elapsed())
+}
+
+/// Async task that offloads compute to Rayon pool
+async fn async_task_rayon(
+    id: usize,
+    n: u64,
+    io_delay: Duration,
+    pool: Arc<ComputePool>,
+) -> (usize, Duration) {
+    let start = Instant::now();
+
+    // Simulate async I/O operation
+    sleep(io_delay).await;
+
+    // CPU-intensive work (offloaded, doesn't block runtime)
+    let _result = pool.execute(move || compute_primes_sum(n)).await.unwrap();
+
+    // More async I/O
+    sleep(io_delay).await;
+
+    (id, start.elapsed())
+}
+
+/// Async task using spawn_blocking
+async fn async_task_spawn_blocking(id: usize, n: u64, io_delay: Duration) -> (usize, Duration) {
+    let start = Instant::now();
+
+    // Simulate async I/O operation
+    sleep(io_delay).await;
+
+    // CPU-intensive work (offloaded to blocking pool)
+    let _result = tokio::task::spawn_blocking(move || compute_primes_sum(n))
+        .await
+        .unwrap();
+
+    // More async I/O
+    sleep(io_delay).await;
+
+    (id, start.elapsed())
+}
+
+async fn run_throughput_test(
+    name: &str,
+    num_tasks: usize,
+    n: u64,
+    io_delay: Duration,
+    pool: Option<Arc<ComputePool>>,
+    mode: &str,
+) -> (Duration, Vec<Duration>) {
+    println!("\n Running: {} (n={}, tasks={})", name, n, num_tasks);
+
+    let completed = Arc::new(AtomicU64::new(0));
+    let start = Instant::now();
+
+    let tasks: Vec<_> = (0..num_tasks)
+        .map(|id| {
+            let pool = pool.clone();
+            let completed = completed.clone();
+            let mode = mode.to_string();
+
+            tokio::spawn(async move {
+                let result = match mode.as_str() {
+                    "inline" => async_task_inline(id, n, io_delay).await,
+                    "rayon" => async_task_rayon(id, n, io_delay, pool.unwrap()).await,
+                    "spawn_blocking" => async_task_spawn_blocking(id, n, io_delay).await,
+                    _ => panic!("Unknown mode"),
+                };
+
+                let count = completed.fetch_add(1, Ordering::Relaxed) + 1;
+                if count.is_multiple_of(10) {
+                    print!(".");
+                    use std::io::{self, Write};
+                    io::stdout().flush().unwrap();
+                }
+
+                result
+            })
+        })
+        .collect();
+
+    let mut latencies = Vec::new();
+    for task in tasks {
+        let (_id, latency) = task.await.unwrap();
+        latencies.push(latency);
+    }
+
+    let total_time = start.elapsed();
+    println!(" Done in {:.2}s", total_time.as_secs_f64());
+
+    (total_time, latencies)
+}
+
+fn calculate_percentiles(latencies: &mut [Duration]) -> (Duration, Duration, Duration) {
+    latencies.sort();
+    let len = latencies.len();
+    let p50 = latencies[len / 2];
+    let p95 = latencies[len * 95 / 100];
+    let p99 = latencies[len * 99 / 100];
+    (p50, p95, p99)
+}
+
+fn print_results(_name: &str, total: Duration, latencies: &mut [Duration]) {
+    let (p50, p95, p99) = calculate_percentiles(latencies);
+    let throughput = latencies.len() as f64 / total.as_secs_f64();
+
+    println!("  Total time:     {:.2}s", total.as_secs_f64());
+    println!("  Throughput:     {:.1} tasks/s", throughput);
+    println!("  Latency p50:    {:.2}ms", p50.as_secs_f64() * 1000.0);
+    println!("  Latency p95:    {:.2}ms", p95.as_secs_f64() * 1000.0);
+    println!("  Latency p99:    {:.2}ms", p99.as_secs_f64() * 1000.0);
+}
+
+/// Entry point of the async throughput demo.
+///
+/// For each compute load `n` it times the compute function on its own, then
+/// runs the same batch of concurrent tasks in three modes ("inline", "rayon",
+/// "spawn_blocking") and prints per-mode statistics plus the speedup of the
+/// offloaded modes relative to inline execution.
+///
+/// # Errors
+/// Returns an error if the compute pool cannot be created.
+#[tokio::main]
+async fn main() -> Result<()> {
+    println!("Async Throughput Demonstration");
+    println!("==================================\n");
+    println!("This demo shows how compute-intensive work affects async task throughput.\n");
+
+    // Create compute pool directly
+    let compute_config = ComputeConfig {
+        num_threads: Some(4),
+        stack_size: Some(2 * 1024 * 1024),
+        thread_prefix: "demo".to_string(),
+        pin_threads: false,
+    };
+    let pool = Arc::new(ComputePool::new(compute_config)?);
+
+    println!("Configuration:");
+    println!("  Rayon threads: {}", pool.num_threads());
+
+    // Test parameters
+    let num_tasks = 100;
+    let io_delay = Duration::from_millis(10);
+
+    println!("\nTest: {} concurrent async tasks", num_tasks);
+    println!("Each task: 10ms I/O → compute → 10ms I/O");
+    println!("Expected minimum time: ~20ms (if no blocking)");
+
+    // Test with different compute loads
+    for n in [10, 1_000, 100_000] {
+        println!("\n{:=<60}", "");
+        println!("Compute load: n={} (prime sum)", n);
+        println!("{:=<60}", "");
+
+        // Measure compute time alone. `black_box` keeps the optimizer from
+        // discarding the otherwise-unused computation, which would make the
+        // reported time meaningless in release builds.
+        let compute_start = Instant::now();
+        std::hint::black_box(compute_primes_sum(std::hint::black_box(n)));
+        let compute_time = compute_start.elapsed();
+        println!(
+            "Pure compute time: {:.2}ms",
+            compute_time.as_secs_f64() * 1000.0
+        );
+
+        // Test 1: Inline execution (blocks async runtime)
+        let (total1, mut latencies1) = run_throughput_test(
+            "Inline (blocks runtime)",
+            num_tasks,
+            n,
+            io_delay,
+            None,
+            "inline",
+        )
+        .await;
+        print_results("Inline", total1, &mut latencies1);
+
+        // Test 2: Rayon offload
+        let (total2, mut latencies2) = run_throughput_test(
+            "Rayon offload",
+            num_tasks,
+            n,
+            io_delay,
+            Some(pool.clone()),
+            "rayon",
+        )
+        .await;
+        print_results("Rayon", total2, &mut latencies2);
+
+        // Test 3: spawn_blocking
+        let (total3, mut latencies3) = run_throughput_test(
+            "spawn_blocking",
+            num_tasks,
+            n,
+            io_delay,
+            None,
+            "spawn_blocking",
+        )
+        .await;
+        print_results("spawn_blocking", total3, &mut latencies3);
+
+        // Analysis: ratios above 1.0 mean the offloaded mode finished sooner
+        // than inline execution.
+        println!("\n Impact Analysis:");
+        let speedup_rayon = total1.as_secs_f64() / total2.as_secs_f64();
+        let speedup_spawn = total1.as_secs_f64() / total3.as_secs_f64();
+
+        println!(
+            "  Rayon vs Inline:          {:.2}x throughput",
+            speedup_rayon
+        );
+        println!(
+            "  spawn_blocking vs Inline: {:.2}x throughput",
+            speedup_spawn
+        );
+
+        // `as_millis` truncates, so this only fires once the compute step
+        // takes at least 2ms.
+        if compute_time.as_millis() > 1 {
+            let blocking_factor = compute_time.as_secs_f64() / io_delay.as_secs_f64();
+            println!(
+                "\n  Compute time ({:.1}ms) is {:.1}x the I/O time",
+                compute_time.as_secs_f64() * 1000.0,
+                blocking_factor
+            );
+            println!("     This severely impacts async concurrency when run inline!");
+        }
+    }
+
+    // Show pool metrics
+    println!("\n Compute Pool Metrics:");
+    println!("========================");
+    println!("{}", pool.metrics());
+
+    println!("\n Conclusion:");
+    println!("==============");
+    println!("• Small compute (n=10): Overhead may not justify offloading");
+    println!("• Medium compute (n=1000): Offloading preserves async throughput");
+    println!("• Large compute (n=100000): Offloading is essential for responsiveness");
+    println!("\nKey insight: Even small amounts of blocking compute can destroy");
+    println!("async throughput when you have many concurrent tasks!");
+
+    Ok(())
+}
--- a/lib/runtime/examples/async_vs_compute_interaction.rs
+++ b/lib/runtime/examples/async_vs_compute_interaction.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Benchmark demonstrating async I/O vs compute workload interaction
+//!
+//! This example measures how different types of compute workloads interfere with
+//! async I/O latency by comparing actual elapsed time vs expected sleep time.
+//!
+//! Key measurements:
+//! - Baseline async overhead with no compute load
+//! - Interference from small (<100μs), medium (~500μs), and large (~2-5ms) compute tasks
+//! - Comparison between all-async (4 Tokio threads) vs hybrid (2 Tokio + 2 Rayon)
+//! - Impact of offloading compute work to dedicated Rayon threads
+//!
+//! The benchmark spawns many lightweight async tasks doing timed sleeps, then runs
+//! a fixed compute workload while measuring how much the compute work delays the
+//! async tasks from being revisited after their sleeps complete.
+//!
+//! Two configurations are tested with EXACTLY 4 total threads:
+//! 1. All-Async: 4 Tokio threads (compute runs inline, blocking async work)
+//! 2. Hybrid: 2 Tokio threads + 2 Rayon threads (compute offloaded, async stays responsive)
+
+use anyhow::Result;
+use dynamo_runtime::{
+    Runtime,
+    compute::{ComputeConfig, ComputePool},
+    compute_large, compute_medium, compute_small,
+};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+use tokio::sync::Semaphore;
+use tokio::time::sleep;
+use tokio_util::sync::CancellationToken;
+
+/// Sleep latency measurement
+///
+/// One record per completed sleep in `sleep_worker`; all values are in
+/// milliseconds.
+#[derive(Debug, Clone)]
+struct SleepMeasurement {
+    // Requested sleep duration (underscore-prefixed: stored but never read).
+    _expected_ms: f64,
+    // Wall-clock time the sleep actually took (stored but never read).
+    _actual_ms: f64,
+    overhead_ms: f64, // actual - expected
+}
+
+/// Statistics for latency measurements
+///
+/// Produced by `calculate_stats` from the `overhead_ms` values of a set of
+/// sleep measurements, so every latency field is an overhead in milliseconds.
+#[derive(Debug, Clone)]
+struct LatencyStats {
+    // Percentiles picked by index from the sorted overheads.
+    p50: f64,
+    p95: f64,
+    p99: f64,
+    // Largest overhead observed.
+    max: f64,
+    // Arithmetic mean of all overheads.
+    mean: f64,
+    // Number of measurements the statistics were computed from.
+    count: usize,
+}
+
+/// Test results for a single configuration
+///
+/// Filled in by `run_interference_test`.
+#[derive(Debug)]
+struct TestResults {
+    // Overhead statistics for sleeps recorded before the compute phase began.
+    baseline_overhead: LatencyStats,
+    // Overhead statistics for sleeps recorded after the baseline window;
+    // `None` when no further measurements were recorded.
+    compute_overhead: Option<LatencyStats>,
+    // Wall-clock time of the compute batch; `None` for the baseline workload.
+    compute_duration: Option<Duration>,
+    // Total number of sleep measurements collected (stored but never read).
+    _total_sleep_measurements: usize,
+}
+
+/// Type of workload to run
+///
+/// Selects which `TaskType`s `execute_compute_batch` spawns; `Mixed` cycles
+/// through small, medium and large tasks by task index.
+#[derive(Debug, Clone, Copy, PartialEq)]
+enum WorkloadType {
+    None,   // No compute (baseline)
+    Small,  // 100% small tasks
+    Medium, // 100% medium tasks
+    Large,  // 100% large tasks
+    Mixed,  // 33/33/33 mix
+}
+
+/// Individual task type
+///
+/// Decides which compute function `execute_compute_task` runs for one task.
+#[derive(Debug, Clone, Copy)]
+enum TaskType {
+    // Runs `small_compute`.
+    Small,
+    // Runs `medium_compute`.
+    Medium,
+    // Runs `large_compute`.
+    Large,
+}
+
+/// CPU-bound workload: adds up every prime in `2..=n`.
+///
+/// Returns 0 when `n < 2`.
+fn compute_primes_sum(n: u64) -> u64 {
+    (2..=n).filter(|&candidate| is_prime(candidate)).sum()
+}
+
+/// Trial-division primality test.
+///
+/// After ruling out multiples of 2 and 3, only divisors of the form
+/// `6k - 1` and `6k + 1` up to the square root of `n` are tried.
+fn is_prime(n: u64) -> bool {
+    match n {
+        0 | 1 => return false,
+        2 | 3 => return true,
+        _ => {}
+    }
+    if n.is_multiple_of(2) || n.is_multiple_of(3) {
+        return false;
+    }
+
+    // Upper bound for trial divisors, taken from the floating-point root.
+    let limit = (n as f64).sqrt() as u64;
+    !(5..=limit)
+        .step_by(6)
+        .any(|k| n.is_multiple_of(k) || n.is_multiple_of(k + 2))
+}
+
+// Global tuned values (set during calibration)
+//
+// Upper bounds passed to `compute_primes_sum` by the small/medium/large
+// compute helpers. The initial values are defaults that
+// `calibrate_compute_functions` overwrites with machine-specific results.
+// NOTE(review): these are `static mut`; access is only sound while no thread
+// writes them concurrently with a read — confirm calibration always finishes
+// before any compute task is spawned.
+static mut SMALL_N: u64 = 1_500;
+static mut MEDIUM_N: u64 = 20_000;
+static mut LARGE_N: u64 = 120_000;
+
+/// Small compute task (~10μs): prime sum up to the calibrated `SMALL_N`.
+fn small_compute() -> u64 {
+    // SAFETY: plain by-value read of a `static mut`; relies on no concurrent
+    // writer existing while compute tasks run.
+    let n = unsafe { SMALL_N };
+    compute_primes_sum(n)
+}
+
+/// Medium compute task (~500μs): prime sum up to the calibrated `MEDIUM_N`.
+fn medium_compute() -> u64 {
+    // SAFETY: plain by-value read of a `static mut`; relies on no concurrent
+    // writer existing while compute tasks run.
+    let n = unsafe { MEDIUM_N };
+    compute_primes_sum(n)
+}
+
+/// Large compute task (~2-5ms): prime sum up to the calibrated `LARGE_N`.
+fn large_compute() -> u64 {
+    // SAFETY: plain by-value read of a `static mut`; relies on no concurrent
+    // writer existing while compute tasks run.
+    let n = unsafe { LARGE_N };
+    compute_primes_sum(n)
+}
+
+/// Dynamically tune a compute function to hit target time
+///
+/// Bisects `n` over the fixed range `10..=1_000_000` (at most 20 probes),
+/// timing `compute_primes_sum(n)` as the mean of three runs per probe.
+///
+/// * `target_us` - desired execution time in microseconds.
+/// * `initial_n` - seed for the best-so-far value; the search range itself
+///   does not depend on it.
+/// * `name` - label used in the progress line printed to stdout.
+///
+/// Returns the first `n` whose measured time is within 20% of the target, or
+/// otherwise the probed `n` that came closest.
+fn tune_compute_n(target_us: f64, initial_n: u64, name: &str) -> u64 {
+    let mut best_n = initial_n;
+    let mut best_diff = f64::MAX;
+
+    // Binary search for the right value
+    let mut low = 10u64;
+    let mut high = 1_000_000u64;
+
+    for _ in 0..20 {
+        // Max 20 iterations
+        let n = (low + high) / 2;
+
+        // Measure this n value (average of 3 runs for stability).
+        // `black_box` prevents the optimizer from dropping the unused
+        // computation, which would make every probe look instantaneous.
+        let mut total_time = Duration::ZERO;
+        for _ in 0..3 {
+            let start = Instant::now();
+            std::hint::black_box(compute_primes_sum(std::hint::black_box(n)));
+            total_time += start.elapsed();
+        }
+        let elapsed_us = total_time.as_secs_f64() * 1_000_000.0 / 3.0;
+
+        let diff = (elapsed_us - target_us).abs();
+        if diff < best_diff {
+            best_diff = diff;
+            best_n = n;
+        }
+
+        // Check if we're close enough (within 20%)
+        if diff / target_us < 0.20 {
+            println!(
+                "  ✓ {} tuned to n={} ({:.1}μs, target {:.0}μs)",
+                name, n, elapsed_us, target_us
+            );
+            return n;
+        }
+
+        // Adjust search range: too fast means n must grow, too slow means
+        // it must shrink.
+        if elapsed_us < target_us {
+            low = n + 1;
+        } else {
+            high = n - 1;
+        }
+
+        if low > high {
+            break;
+        }
+    }
+
+    // Use best found value; re-time it once so the printed figure belongs to
+    // the value actually returned.
+    let start = Instant::now();
+    std::hint::black_box(compute_primes_sum(std::hint::black_box(best_n)));
+    let final_time = start.elapsed().as_secs_f64() * 1_000_000.0;
+    println!(
+        "  ✓ {} tuned to n={} ({:.1}μs, target {:.0}μs)",
+        name, best_n, final_time, target_us
+    );
+    best_n
+}
+
+/// Tunes `SMALL_N`, `MEDIUM_N` and `LARGE_N` for the current machine.
+///
+/// Each value is replaced by the result of `tune_compute_n` for targets of
+/// 10μs, 500μs and 3000μs respectively, and the tuned values are printed.
+fn calibrate_compute_functions() {
+    println!("\n Dynamically calibrating compute functions for this machine...");
+    println!("{:-<60}", "");
+
+    // SAFETY: direct reads and writes of the `static mut` tuning values;
+    // relies on no other thread accessing them during calibration.
+    let (small, medium, large) = unsafe {
+        SMALL_N = tune_compute_n(10.0, SMALL_N, "Small");
+        MEDIUM_N = tune_compute_n(500.0, MEDIUM_N, "Medium");
+        // 3ms sits in the middle of the 2-5ms range quoted for large tasks.
+        LARGE_N = tune_compute_n(3000.0, LARGE_N, "Large");
+        (SMALL_N, MEDIUM_N, LARGE_N)
+    };
+
+    println!();
+    println!("For future runs on this machine, you can use:");
+    println!("     SMALL_N  = {}", small);
+    println!("     MEDIUM_N = {}", medium);
+    println!("     LARGE_N  = {}", large);
+}
+
+/// Sleeps for `sleep_duration` in a loop, recording how late each wake-up is.
+///
+/// Every completed sleep appends a `SleepMeasurement` to `results`. The loop
+/// ends as soon as `cancel` fires, including in the middle of a sleep.
+async fn sleep_worker(
+    sleep_duration: Duration,
+    results: Arc<Mutex<Vec<SleepMeasurement>>>,
+    cancel: CancellationToken,
+) {
+    let expected_secs = sleep_duration.as_secs_f64();
+
+    loop {
+        if cancel.is_cancelled() {
+            break;
+        }
+
+        let started = Instant::now();
+        tokio::select! {
+            _ = sleep(sleep_duration) => {
+                let actual_secs = started.elapsed().as_secs_f64();
+                let sample = SleepMeasurement {
+                    _expected_ms: expected_secs * 1000.0,
+                    _actual_ms: actual_secs * 1000.0,
+                    overhead_ms: (actual_secs - expected_secs) * 1000.0,
+                };
+                results.lock().unwrap().push(sample);
+            }
+            _ = cancel.cancelled() => break,
+        }
+    }
+}
+
+/// Runs one compute task of the given type.
+///
+/// Small tasks always go through `compute_small!`. Medium and large tasks are
+/// submitted to `pool` when one is supplied; without a pool they run inline on
+/// the calling Tokio thread, blocking it for the duration of the computation.
+async fn execute_compute_task(task_type: TaskType, pool: Option<Arc<ComputePool>>) -> Result<u64> {
+    match (task_type, pool) {
+        // Small tasks never use the pool.
+        (TaskType::Small, _) => Ok(compute_small!(small_compute())),
+        // Pool available: offload.
+        (TaskType::Medium, Some(pool)) => pool.execute(medium_compute).await,
+        (TaskType::Large, Some(pool)) => pool.execute(large_compute).await,
+        // No pool: run inline (blocks the async worker).
+        (TaskType::Medium, None) => Ok(medium_compute()),
+        (TaskType::Large, None) => Ok(large_compute()),
+    }
+}
+
+/// Spawns `num_tasks` compute tasks and waits for all of them.
+///
+/// A permit from `concurrency_limit` is acquired before each spawn and held
+/// until that task finishes, bounding the number of tasks in flight. Returns
+/// the wall-clock time for the whole batch, or zero for `WorkloadType::None`.
+///
+/// # Panics
+/// Panics if the semaphore is closed, a spawned task panics, or a task
+/// returns an error.
+async fn execute_compute_batch(
+    workload_type: WorkloadType,
+    num_tasks: usize,
+    concurrency_limit: Arc<Semaphore>,
+    pool: Option<Arc<ComputePool>>,
+) -> Duration {
+    if workload_type == WorkloadType::None {
+        return Duration::ZERO;
+    }
+
+    let started = Instant::now();
+    let mut in_flight = Vec::new();
+
+    for index in 0..num_tasks {
+        let permit = concurrency_limit.clone().acquire_owned().await.unwrap();
+        let task_pool = pool.clone();
+
+        // Mixed workloads rotate small -> medium -> large by task index.
+        let task_type = match (workload_type, index % 3) {
+            (WorkloadType::Small, _) | (WorkloadType::Mixed, 0) => TaskType::Small,
+            (WorkloadType::Medium, _) | (WorkloadType::Mixed, 1) => TaskType::Medium,
+            (WorkloadType::Large, _) | (WorkloadType::Mixed, _) => TaskType::Large,
+            (WorkloadType::None, _) => unreachable!(),
+        };
+
+        in_flight.push(tokio::spawn(async move {
+            // Keep the permit alive until the task is done.
+            let _permit = permit;
+            execute_compute_task(task_type, task_pool).await
+        }));
+    }
+
+    // Drain every handle so the elapsed time covers the slowest task.
+    for task in in_flight {
+        task.await.unwrap().unwrap();
+    }
+
+    started.elapsed()
+}
+
+/// Calculate statistics from measurements
+///
+/// Summarises the `overhead_ms` values of `measurements`: percentiles picked
+/// by index from the sorted values, plus maximum, mean and count. An empty
+/// slice produces an all-zero `LatencyStats`.
+fn calculate_stats(measurements: &[SleepMeasurement]) -> LatencyStats {
+    if measurements.is_empty() {
+        return LatencyStats {
+            p50: 0.0,
+            p95: 0.0,
+            p99: 0.0,
+            max: 0.0,
+            mean: 0.0,
+            count: 0,
+        };
+    }
+
+    let mut overheads: Vec<f64> = measurements.iter().map(|m| m.overhead_ms).collect();
+    // `total_cmp` is a total order over f64, so sorting can never panic even
+    // if a NaN slips into the data (unlike `partial_cmp(..).unwrap()`).
+    overheads.sort_unstable_by(f64::total_cmp);
+
+    let len = overheads.len();
+    LatencyStats {
+        p50: overheads[len / 2],
+        p95: overheads[len * 95 / 100],
+        p99: overheads[len * 99 / 100],
+        // The sorted vector is non-empty here, so the last element is the max.
+        max: overheads[len - 1],
+        mean: overheads.iter().sum::<f64>() / len as f64,
+        count: len,
+    }
+}
+
+/// Measures async sleep overhead before and during one compute workload.
+///
+/// Starts a set of sleep workers, lets them run undisturbed for a baseline
+/// window, then runs the compute batch (or just waits another second for the
+/// baseline workload), cancels the workers and splits the recorded
+/// measurements at the point where the baseline window ended.
+///
+/// `_runtime` is accepted but unused.
+async fn run_interference_test(
+    _runtime: Arc<Runtime>,
+    workload_type: WorkloadType,
+    pool: Option<Arc<ComputePool>>,
+) -> TestResults {
+    // Configuration
+    const NUM_SLEEP_TASKS: usize = 100;
+    const SLEEP_DURATION_MS: u64 = 1;
+    const NUM_COMPUTE_TASKS: usize = 2000; // Increased for longer workload
+    const CONCURRENCY_LIMIT: usize = 8; // Allow more parallel work
+    const BASELINE_DURATION_SECS: u64 = 1;
+
+    // Phase 1: background async load of short, repeated sleeps.
+    let results = Arc::new(Mutex::new(Vec::new()));
+    let cancel = CancellationToken::new();
+    let workers: Vec<_> = (0..NUM_SLEEP_TASKS)
+        .map(|_| {
+            tokio::spawn(sleep_worker(
+                Duration::from_millis(SLEEP_DURATION_MS),
+                results.clone(),
+                cancel.clone(),
+            ))
+        })
+        .collect();
+
+    // Phase 2: baseline window; remember how many samples belong to it.
+    sleep(Duration::from_secs(BASELINE_DURATION_SECS)).await;
+    let baseline_count = results.lock().unwrap().len();
+
+    // Phase 3: the compute workload, or one more idle second for baseline.
+    let compute_duration = if workload_type == WorkloadType::None {
+        sleep(Duration::from_secs(1)).await;
+        None
+    } else {
+        let semaphore = Arc::new(Semaphore::new(CONCURRENCY_LIMIT));
+        let elapsed =
+            execute_compute_batch(workload_type, NUM_COMPUTE_TASKS, semaphore, pool).await;
+        Some(elapsed)
+    };
+
+    // Phase 4: stop the sleep workers and wait for them to exit.
+    cancel.cancel();
+    for worker in workers {
+        worker.await.unwrap();
+    }
+
+    // Phase 5: split the samples into baseline and during-compute parts.
+    let all_measurements = results.lock().unwrap().clone();
+    let split = baseline_count.min(all_measurements.len());
+    let (baseline, during_compute) = all_measurements.split_at(split);
+    let compute_overhead = if during_compute.is_empty() {
+        None
+    } else {
+        Some(calculate_stats(during_compute))
+    };
+
+    TestResults {
+        baseline_overhead: calculate_stats(baseline),
+        compute_overhead,
+        compute_duration,
+        _total_sleep_measurements: all_measurements.len(),
+    }
+}
+
+/// Run all test workloads for a given configuration
+///
+/// Runs `run_interference_test` once per workload type and prints the
+/// baseline overhead, the overhead observed during compute, the resulting
+/// interference factor, and the compute batch duration and throughput.
+///
+/// `pool` is forwarded to every test; `None` means compute runs inline.
+///
+/// # Errors
+/// Returns an error if `Runtime::from_current` fails.
+async fn run_all_tests(pool: Option<Arc<ComputePool>>) -> Result<()> {
+    // Number of compute tasks each interference test submits. Must match
+    // `NUM_COMPUTE_TASKS` in `run_interference_test`; the throughput line
+    // previously divided 1000 by the duration and under-reported by half.
+    const COMPUTE_TASKS_PER_RUN: f64 = 2000.0;
+
+    let workload_types = [
+        ("Baseline (no compute)", WorkloadType::None),
+        ("100% Small (~10μs each)", WorkloadType::Small),
+        ("100% Medium (~500μs each)", WorkloadType::Medium),
+        ("100% Large (~2-5ms each)", WorkloadType::Large),
+        ("Mixed 33/33/33", WorkloadType::Mixed),
+    ];
+
+    // Create dummy runtime for the test functions
+    let runtime = Arc::new(Runtime::from_current()?);
+
+    for (name, workload) in &workload_types {
+        println!("\n Workload: {}", name);
+        println!("{:-<50}", "");
+
+        let results = run_interference_test(runtime.clone(), *workload, pool.clone()).await;
+
+        // Always show baseline overhead
+        println!("  Baseline async overhead (first {}s):", 1);
+        println!(
+            "    Mean: {:.3}ms, P50: {:.3}ms, P95: {:.3}ms, P99: {:.3}ms",
+            results.baseline_overhead.mean,
+            results.baseline_overhead.p50,
+            results.baseline_overhead.p95,
+            results.baseline_overhead.p99
+        );
+        println!("    Measurements: {}", results.baseline_overhead.count);
+
+        // Show compute interference if applicable
+        if let Some(compute_overhead) = results.compute_overhead {
+            println!("\n  During compute workload:");
+            println!(
+                "    Mean: {:.3}ms, P50: {:.3}ms, P95: {:.3}ms, P99: {:.3}ms",
+                compute_overhead.mean,
+                compute_overhead.p50,
+                compute_overhead.p95,
+                compute_overhead.p99
+            );
+            println!(
+                "    Max: {:.3}ms, Measurements: {}",
+                compute_overhead.max, compute_overhead.count
+            );
+
+            // Calculate interference factor; skipped when the baseline mean
+            // is not positive because the ratio would be meaningless.
+            if results.baseline_overhead.mean > 0.0 {
+                let interference_factor = compute_overhead.mean / results.baseline_overhead.mean;
+                println!(
+                    "\n   Interference factor: {:.1}x slower",
+                    interference_factor
+                );
+
+                // Provide interpretation
+                let impact = if interference_factor < 2.0 {
+                    "Minimal - async remains responsive"
+                } else if interference_factor < 10.0 {
+                    "Moderate - noticeable async delays"
+                } else {
+                    "SEVERE - async tasks are heavily blocked!"
+                };
+                println!("     Impact: {}", impact);
+            }
+        }
+
+        // Show compute duration
+        if let Some(duration) = results.compute_duration {
+            println!(
+                "\n  Compute workload completed in: {:.2}s",
+                duration.as_secs_f64()
+            );
+            println!(
+                "  Throughput: {:.0} tasks/sec",
+                COMPUTE_TASKS_PER_RUN / duration.as_secs_f64()
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Entry point of the async-vs-compute interaction benchmark.
+///
+/// Calibrates the compute functions, then runs the full workload suite under
+/// two hand-built Tokio runtimes (4 workers with no pool, and 2 workers plus
+/// a 2-thread compute pool), and finally exercises the `compute_small!`,
+/// `compute_medium!` and `compute_large!` macros in a third runtime.
+///
+/// Returns an error if a runtime or compute pool cannot be built, or if any
+/// of the benchmark stages fails.
+fn main() -> Result<()> {
+    println!(" Async vs Compute Interaction Benchmark");
+    println!("==========================================");
+    println!();
+    println!("This benchmark measures how compute workloads interfere with async I/O latency.");
+    println!("We test with EXACTLY 4 total threads in two configurations:");
+    println!("  1. All-Async: 4 Tokio threads (compute blocks async work)");
+    println!("  2. Hybrid: 2 Tokio + 2 Rayon threads (compute offloaded)");
+    println!("  3. Bonus: Thread-local macro demonstration");
+    println!();
+    println!("Lower overhead numbers mean better async responsiveness.");
+
+    // Calibrate compute functions
+    // Runs before any runtime is built, so the tuned globals are written
+    // before any compute task reads them.
+    calibrate_compute_functions();
+
+    // Test 1: All Async (4 Tokio threads, no Rayon)
+    println!("\n{:=<70}", "");
+    println!("Configuration 1: All-Async (4 Tokio threads, no Rayon)");
+    println!("{:=<70}", "");
+    println!("  Compute tasks run INLINE on Tokio threads, blocking async work!");
+
+    let all_async_rt = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(4)
+        .thread_name("tokio-worker")
+        .enable_all()
+        .build()?;
+
+    all_async_rt.block_on(async {
+        // No compute pool for all-async mode
+        // All compute work will run inline on Tokio threads
+        run_all_tests(None).await
+    })?;
+
+    // Test 2: Hybrid (2 Tokio + 2 Rayon)
+    println!("\n{:=<70}", "");
+    println!("Configuration 2: Hybrid (2 Tokio + 2 Rayon threads)");
+    println!("{:=<70}", "");
+    println!(" Compute tasks offloaded to Rayon, keeping async threads free!");
+
+    let hybrid_rt = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(2)
+        .thread_name("tokio-worker")
+        .enable_all()
+        .build()?;
+
+    // Create Rayon pool with 2 threads
+    let compute_pool = Arc::new(ComputePool::new(ComputeConfig {
+        num_threads: Some(2),
+        stack_size: Some(2 * 1024 * 1024),
+        thread_prefix: "rayon".to_string(),
+        pin_threads: false,
+    })?);
+
+    hybrid_rt.block_on(async { run_all_tests(Some(compute_pool)).await })?;
+
+    // Summary
+    println!("\n{:=<70}", "");
+    println!(" Key Takeaway");
+    println!("{:=<70}", "");
+    println!();
+    println!("The benchmark demonstrates that offloading compute work to dedicated");
+    println!("threads becomes increasingly important as task duration increases.");
+    println!("Look at the interference factors above to see the actual impact.");
+
+    // Test 3: Demonstrate thread-local macros
+    println!("\n{:=<70}", "");
+    println!("Configuration 3: Thread-Local Macro Demonstration");
+    println!("{:=<70}", "");
+    println!("Testing thread-local compute context initialization...");
+
+    // Create a runtime with thread-local setup
+    let macro_rt = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(2)
+        .thread_name("macro-demo")
+        .enable_all()
+        .build()?;
+
+    // Create compute pool for macros
+    let macro_pool = Arc::new(ComputePool::new(ComputeConfig {
+        num_threads: Some(2),
+        stack_size: Some(2 * 1024 * 1024),
+        thread_prefix: "macro-compute".to_string(),
+        pin_threads: false,
+    })?);
+
+    macro_rt.block_on(async {
+        // We can't directly create a Runtime with private fields, so we'll
+        // initialize thread-local context manually using a barrier approach
+
+        // Set up semaphore permits
+        let permits = Arc::new(tokio::sync::Semaphore::new(1)); // 2 workers - 1
+
+        // Detect number of worker threads
+        use std::collections::HashSet;
+        use std::sync::Mutex;
+
+        let thread_ids = Arc::new(Mutex::new(HashSet::new()));
+        let mut handles = Vec::new();
+
+        // Probe to find worker thread count
+        // NOTE(review): `spawn_blocking` closures are documented to run on
+        // Tokio's blocking thread pool, so the set collected here presumably
+        // holds blocking-pool thread ids rather than the runtime's async
+        // worker threads — confirm this is the intended population.
+        for _ in 0..50 {
+            let ids = Arc::clone(&thread_ids);
+            let handle = tokio::task::spawn_blocking(move || {
+                let thread_id = std::thread::current().id();
+                ids.lock().unwrap().insert(thread_id);
+            });
+            handles.push(handle);
+        }
+
+        // Join results are deliberately ignored; only the collected ids matter.
+        for handle in handles {
+            let _ = handle.await;
+        }
+
+        let num_workers = thread_ids.lock().unwrap().len();
+        println!("  Detected {} worker threads", num_workers);
+
+        // Now initialize thread-local on all workers using a barrier
+        // The barrier holds each closure until `num_workers` of them are
+        // running at once, which forces them onto distinct threads.
+        let barrier = Arc::new(std::sync::Barrier::new(num_workers));
+        let mut init_handles = Vec::new();
+
+        for i in 0..num_workers {
+            let barrier_clone = Arc::clone(&barrier);
+            let pool_clone = Arc::clone(&macro_pool);
+            let permits_clone = Arc::clone(&permits);
+
+            let handle = tokio::task::spawn_blocking(move || {
+                // Wait at barrier
+                barrier_clone.wait();
+
+                // Initialize thread-local
+                dynamo_runtime::compute::thread_local::initialize_context(
+                    pool_clone,
+                    permits_clone,
+                );
+                println!("  Initialized thread-local on worker {}", i);
+            });
+            init_handles.push(handle);
+        }
+
+        // Unlike the probe above, a failed or panicked init task aborts the
+        // demo through `?`.
+        for handle in init_handles {
+            handle.await?;
+        }
+
+        // Test if macros work
+        println!("\n Testing thread-local macros:");
+
+        // NOTE(review): this check runs inside the `block_on` future, which
+        // presumably executes on the thread that called `block_on` rather than
+        // on one of the threads initialized above — verify which thread's
+        // context is being inspected.
+        if dynamo_runtime::compute::thread_local::has_compute_context() {
+            println!("  Thread-local context is available!");
+
+            // Test compute_small! macro
+            println!("\n  Testing compute_small! (inline):");
+            let start = std::time::Instant::now();
+            let result = compute_small!(small_compute());
+            println!("    Result: {}, Time: {:?}", result, start.elapsed());
+
+            // Test compute_medium! macro (would use thread-local context)
+            println!("\n  Testing compute_medium! (block_in_place or offload):");
+            let start = std::time::Instant::now();
+            let result = compute_medium!(medium_compute());
+            println!("    Result: {}, Time: {:?}", result, start.elapsed());
+
+            // Test compute_large! macro (would use thread-local context)
+            println!("\n  Testing compute_large! (always offload):");
+            let start = std::time::Instant::now();
+            let result = compute_large!(large_compute());
+            println!("    Result: {}, Time: {:?}", result, start.elapsed());
+
+            println!("\n  All macros work with thread-local context!");
+        } else {
+            println!("  Thread-local context NOT available - macros would fail");
+        }
+
+        // Explicit error type so `?` inside this async block can infer it.
+        Ok::<_, anyhow::Error>(())
+    })?;
+
+    println!();
+    println!(" Benchmark complete!");
+
+    Ok(())
+}
--- a/lib/runtime/examples/compute_overhead_demo.rs
+++ b/lib/runtime/examples/compute_overhead_demo.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use anyhow::Result;
+use dynamo_runtime::compute::{ComputeConfig, ComputePool};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+/// CPU-bound workload used by the measurements: the sum of all primes in
+/// `2..=n` (0 when `n < 2`).
+fn compute_primes_sum(n: u64) -> u64 {
+    (2..=n).fold(0u64, |total, candidate| {
+        if is_prime(candidate) {
+            total + candidate
+        } else {
+            total
+        }
+    })
+}
+
+/// Returns `true` when `n` is prime.
+///
+/// Uses trial division: 2 and 3 are handled up front, then only divisors
+/// `d` and `d + 2` for `d = 5, 11, 17, ...` up to the square root are tried.
+fn is_prime(n: u64) -> bool {
+    if n < 2 {
+        return false;
+    }
+    if n < 4 {
+        return true;
+    }
+    if n.is_multiple_of(2) || n.is_multiple_of(3) {
+        return false;
+    }
+
+    // Largest divisor worth trying, from the floating-point square root.
+    let root = (n as f64).sqrt() as u64;
+    (5..=root)
+        .step_by(6)
+        .all(|d| !n.is_multiple_of(d) && !n.is_multiple_of(d + 2))
+}
+
+/// Times one inline call of `compute_primes_sum(n)` on the current task.
+///
+/// The input and result pass through `std::hint::black_box` so the optimizer
+/// cannot discard the otherwise-unused computation and report a near-zero
+/// duration.
+async fn measure_direct(n: u64) -> Duration {
+    let start = Instant::now();
+    std::hint::black_box(compute_primes_sum(std::hint::black_box(n)));
+    start.elapsed()
+}
+
+/// Times one `compute_primes_sum(n)` call submitted through `pool.execute`,
+/// including the round trip to the pool and back.
+///
+/// # Panics
+/// Panics if the pool reports an error for the task.
+async fn measure_rayon(pool: &ComputePool, n: u64) -> Duration {
+    let started = Instant::now();
+    let outcome = pool.execute(move || compute_primes_sum(n)).await;
+    let _ = outcome.unwrap();
+    started.elapsed()
+}
+
+/// Times one `compute_primes_sum(n)` call run via
+/// `tokio::task::spawn_blocking`, including spawn and join overhead.
+///
+/// # Panics
+/// Panics if the blocking task fails to join.
+async fn measure_spawn_blocking(n: u64) -> Duration {
+    let started = Instant::now();
+    let handle = tokio::task::spawn_blocking(move || compute_primes_sum(n));
+    let _ = handle.await.unwrap();
+    started.elapsed()
+}
+
+/// Formats a duration using the largest unit that is at least 1.
+///
+/// Seconds, milliseconds and microseconds are shown with two decimals;
+/// anything below one microsecond is shown as whole nanoseconds.
+fn format_duration(d: Duration) -> String {
+    let secs = d.as_secs_f64();
+
+    if d >= Duration::from_secs(1) {
+        return format!("{:.2}s", secs);
+    }
+    if d >= Duration::from_millis(1) {
+        return format!("{:.2}ms", secs * 1000.0);
+    }
+    if d >= Duration::from_micros(1) {
+        return format!("{:.2}μs", secs * 1_000_000.0);
+    }
+    format!("{}ns", d.as_nanos())
+}
+
+/// Prints the results-table header: a blank line, a 120-column `=` rule, the
+/// right-aligned column titles, and a 120-column `-` rule.
+fn print_table_header() {
+    let titles = format!(
+        "{:>10} | {:>15} | {:>15} | {:>15} | {:>12} | {:>12} | {:>20}",
+        "n", "Direct", "Rayon", "spawn_blocking", "Rayon Ratio", "Spawn Ratio", "Winner"
+    );
+
+    println!("\n{:=<120}", "");
+    println!("{}", titles);
+    println!("{:-<120}", "");
+}
+
+/// Prints one row of the results table.
+///
+/// The ratio columns are each strategy's time divided by the direct time.
+/// The winner is whichever offload strategy is both faster than direct and
+/// strictly faster than the other one; otherwise "Direct". When
+/// `highlight_crossover` is set the row is wrapped in `>>> ... <<<`.
+fn print_row(
+    n: u64,
+    direct: Duration,
+    rayon: Duration,
+    spawn_blocking: Duration,
+    highlight_crossover: bool,
+) {
+    let direct_secs = direct.as_secs_f64();
+    let rayon_ratio = rayon.as_secs_f64() / direct_secs;
+    let spawn_ratio = spawn_blocking.as_secs_f64() / direct_secs;
+
+    let rayon_wins = rayon_ratio < 1.0 && rayon_ratio < spawn_ratio;
+    let spawn_wins = spawn_ratio < 1.0 && spawn_ratio < rayon_ratio;
+    let winner = match (rayon_wins, spawn_wins) {
+        (true, _) => "Rayon ✓",
+        (false, true) => "spawn_blocking",
+        (false, false) => "Direct",
+    };
+
+    let row = format!(
+        "{:>10} | {:>15} | {:>15} | {:>15} | {:>12.2}x | {:>12.2}x | {:>20}",
+        n,
+        format_duration(direct),
+        format_duration(rayon),
+        format_duration(spawn_blocking),
+        rayon_ratio,
+        spawn_ratio,
+        winner
+    );
+
+    let (prefix, suffix) = if highlight_crossover {
+        (">>> ", " <<<")
+    } else {
+        ("", "")
+    };
+    println!("{}{}{}", prefix, row, suffix);
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    println!("🔬 Compute Pool Overhead Demonstration");
+    println!("=====================================\n");
+
+    // Create compute pool directly
+    let compute_config = ComputeConfig {
+        num_threads: Some(4),
+        stack_size: Some(2 * 1024 * 1024),
+        thread_prefix: "demo".to_string(),
+        pin_threads: false,
+    };
+    let pool = Arc::new(ComputePool::new(compute_config)?);
+
+    println!("Configuration:");
+    println!("  Rayon threads: {}", pool.num_threads());
+    println!();
+
+    // Warm up all execution paths
+    println!("Warming up...");
+    for _ in 0..5 {
+        let _ = measure_direct(100).await;
+        let _ = measure_rayon(&pool, 100).await;
+        let _ = measure_spawn_blocking(100).await;
+    }
+
+    print_table_header();
+
+    // Dynamic scanning with exponential growth
+    let mut n = 10u64;
+    let mut results = Vec::new();
+    let mut found_crossover = false;
+    let mut last_rayon_ratio = f64::MAX;
+
+    while n <= 1_000_000 {
+        // Measure each approach multiple times and take the minimum
+        let mut direct_times = Vec::new();
+        for _ in 0..3 {
+            direct_times.push(measure_direct(n).await);
+        }
+        let direct = direct_times.into_iter().min().unwrap();
+
+        let mut rayon_times = Vec::new();
+        for _ in 0..3 {
+            rayon_times.push(measure_rayon(&pool, n).await);
+        }
+        let rayon = rayon_times.into_iter().min().unwrap();
+
+        let mut spawn_times = Vec::new();
+        for _ in 0..3 {
+            spawn_times.push(measure_spawn_blocking(n).await);
+        }
+        let spawn_blocking = spawn_times.into_iter().min().unwrap();
+
+        let rayon_ratio = rayon.as_secs_f64() / direct.as_secs_f64();
+
+        // Detect crossover point
+        let is_crossover = !found_crossover && rayon_ratio < 1.0 && last_rayon_ratio >= 1.0;
+        if is_crossover {
+            found_crossover = true;
+        }
+
+        print_row(n, direct, rayon, spawn_blocking, is_crossover);
+        results.push((n, direct, rayon, spawn_blocking));
+
+        last_rayon_ratio = rayon_ratio;
+
+        // Adaptive step size
+        if n < 100 {
+            n = (n as f64 * 2.0) as u64;
+        } else if n < 10_000 {
+            n = (n as f64 * 3.16) as u64; // ~10x every 2 steps
+        } else {
+            n *= 10;
+        }
+    }
+
+    println!("{:=<120}", "");
+
+    // Analysis
+    println!("\n Analysis:");
+    println!("============\n");
+
+    if found_crossover {
+        let crossover_point = results
+            .iter()
+            .find(|(_, d, r, _)| r.as_secs_f64() < d.as_secs_f64())
+            .map(|(n, _, _, _)| *n);
+
+        if let Some(n) = crossover_point {
+            println!("✓ Rayon becomes beneficial at n ≈ {}", n);
+            println!(
+                "  Below n={}: Overhead dominates, direct execution is faster",
+                n
+            );
+            println!(
+                "  Above n={}: Compute dominates, Rayon offload is faster",
+                n
+            );
+        }
+    } else {
+        println!("✗ No crossover found in tested range");
+        println!("  Direct execution was always faster (overhead too high)");
+    }
+
+    // Find where spawn_blocking becomes beneficial
+    let spawn_crossover = results
+        .iter()
+        .find(|(_, d, _, s)| s.as_secs_f64() < d.as_secs_f64())
+        .map(|(n, _, _, _)| *n);
+
+    if let Some(n) = spawn_crossover {
+        println!("\n✓ spawn_blocking becomes beneficial at n ≈ {}", n);
+    }
+
+    // Show overhead at minimum
+    if let Some((n, direct, rayon, spawn)) = results.first() {
+        let rayon_overhead = rayon.as_secs_f64() - direct.as_secs_f64();
+        let spawn_overhead = spawn.as_secs_f64() - direct.as_secs_f64();
+        println!("\nOverhead at n={}:", n);
+        println!(
+            "  Rayon:          +{}",
+            format_duration(Duration::from_secs_f64(rayon_overhead))
+        );
+        println!(
+            "  spawn_blocking: +{}",
+            format_duration(Duration::from_secs_f64(spawn_overhead))
+        );
+    }
+
+    // Show benefit at maximum
+    if let Some((n, direct, rayon, spawn)) = results.last() {
+        let rayon_speedup = direct.as_secs_f64() / rayon.as_secs_f64();
+        let spawn_speedup = direct.as_secs_f64() / spawn.as_secs_f64();
+        println!("\nSpeedup at n={}:", n);
+        println!("  Rayon:          {:.2}x faster", rayon_speedup);
+        println!("  spawn_blocking: {:.2}x faster", spawn_speedup);
+    }
+
+    // Print pool metrics
+    println!("\n Compute Pool Metrics:");
+    println!("========================");
+    println!("{}", pool.metrics());
+
+    Ok(())
+}
--- a/lib/runtime/examples/compute_pool_example.rs
+++ b/lib/runtime/examples/compute_pool_example.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Example demonstrating the use of ComputePool for CPU-intensive operations
+//!
+//! This example shows various patterns for using Rayon with Tokio:
+//! - Fork-join with scope
+//! - Parallel batch processing
+//! - Dynamic task spawning
+//! - Integration with async services
+
+use anyhow::Result;
+use dynamo_runtime::{
+    Worker,
+    compute::{ComputePool, ComputePoolExt},
+};
+use std::sync::{Arc, Mutex};
+use std::time::Instant;
+
+/// CPU-bound stand-in workload: adds up every prime in `2..input`.
+///
+/// Returns `0` when `input <= 2`, because the candidate range is then empty.
+fn expensive_computation(input: u64) -> u64 {
+    // Each candidate is checked with `is_prime`; only primes reach the sum.
+    (2..input).filter(|&candidate| is_prime(candidate)).sum()
+}
+
+/// Trial-division primality test.
+///
+/// `0` and `1` are reported as non-prime. Any other `n` is prime when no
+/// value from 2 up to the truncated `f64` square root of `n` divides it.
+fn is_prime(n: u64) -> bool {
+    // The range end is exclusive, hence the `+ 1` after truncating the root.
+    let limit = (n as f64).sqrt() as u64 + 1;
+    n > 1 && (2..limit).all(|divisor| !n.is_multiple_of(divisor))
+}
+
+/// Example 1: fork-join — two independent computations handed to the pool in one call.
+async fn example_fork_join(pool: &ComputePool) -> Result<()> {
+    println!("\n=== Example 1: Fork-Join Pattern ===");
+
+    let timer = Instant::now();
+
+    // Both closures are passed to `pool.join`; awaiting yields the result pair.
+    let smaller = || expensive_computation(10000);
+    let larger = || expensive_computation(20000);
+    let (result1, result2) = pool.join(smaller, larger).await?;
+
+    println!("Fork-join results: {} and {}", result1, result2);
+    println!("Time: {:?}", timer.elapsed());
+
+    Ok(())
+}
+
+/// Example 2: Scope-based parallel execution
+///
+/// Spawns one task per input value on the scope handed out by
+/// `execute_scoped`. Each task writes its result into its own slot of a
+/// mutex-protected vector, which is then returned from the scope closure.
+async fn example_scope(pool: &ComputePool) -> Result<()> {
+    println!("\n=== Example 2: Scope-based Execution ===");
+
+    let data = [1000, 2000, 3000, 4000, 5000];
+    let start = Instant::now();
+
+    let results = pool
+        .execute_scoped(move |scope| {
+            // One zero-initialised slot per input; tasks fill them in by index.
+            let results = Arc::new(Mutex::new(vec![0u64; data.len()]));
+
+            for (i, &value) in data.iter().enumerate() {
+                let results = results.clone();
+                scope.spawn(move |_| {
+                    let result = expensive_computation(value);
+                    let mut r = results.lock().unwrap();
+                    r[i] = result;
+                });
+            }
+
+            // NOTE(review): `Arc::try_unwrap` returns `Err` (and the `unwrap` panics)
+            // while any spawned task still holds its clone of `results`. This line
+            // presumably relies on every task having finished by now — confirm
+            // against the `execute_scoped` implementation; with a plain
+            // `rayon::Scope`, spawned tasks are only guaranteed to be complete
+            // after the scope closure has returned.
+            Arc::try_unwrap(results).unwrap().into_inner().unwrap()
+        })
+        .await?;
+
+    println!("Scope results: {:?}", results);
+    println!("Time: {:?}", start.elapsed());
+
+    Ok(())
+}
+
+/// Example 3: `parallel_map` on the pool, timed against a plain sequential map.
+async fn example_parallel_map(pool: &ComputePool) -> Result<()> {
+    println!("\n=== Example 3: Parallel Map ===");
+
+    // Ten workloads: 1000, 2000, ..., 10000.
+    let items: Vec<u64> = (1..=10).map(|k| k * 1000).collect();
+
+    let parallel_timer = Instant::now();
+    let results = pool
+        .parallel_map(items.clone(), expensive_computation)
+        .await?;
+    println!("Parallel map processed {} items", results.len());
+    println!("Time: {:?}", parallel_timer.elapsed());
+
+    // Same inputs processed on the calling thread, for comparison.
+    let sequential_timer = Instant::now();
+    let _sequential: Vec<u64> = items.iter().copied().map(expensive_computation).collect();
+    println!("Sequential time: {:?}", sequential_timer.elapsed());
+
+    Ok(())
+}
+
+/// Example 4: Simulating tokenization workload
+///
+/// Builds 100 short texts and spawns one scope task per text. Each task
+/// counts whitespace-separated words, sleeps 100µs to stand in for extra
+/// work, and stores the count in its slot of a shared vector.
+async fn example_tokenization(pool: &ComputePool) -> Result<()> {
+    println!("\n=== Example 4: Batch Tokenization Simulation ===");
+
+    // Simulate batch of texts to tokenize
+    let texts: Vec<String> = (0..100)
+        .map(|i| {
+            format!(
+                "This is sample text number {} that needs to be tokenized",
+                i
+            )
+        })
+        .collect();
+
+    let start = Instant::now();
+    // Captured before `texts` is moved into the scope closure below.
+    let texts_len = texts.len();
+
+    // Process in parallel using scope
+    let token_counts = pool
+        .execute_scoped(move |scope| {
+            let counts = Arc::new(Mutex::new(vec![0usize; texts_len]));
+
+            for (i, text) in texts.iter().enumerate() {
+                // Each task gets its own copy of the text and a handle to the counts.
+                let text = text.clone();
+                let counts = counts.clone();
+                scope.spawn(move |_| {
+                    // Simulate tokenization by counting words
+                    let count = text.split_whitespace().count();
+                    // Simulate more work
+                    std::thread::sleep(std::time::Duration::from_micros(100));
+                    let mut c = counts.lock().unwrap();
+                    c[i] = count;
+                });
+            }
+
+            // NOTE(review): `Arc::try_unwrap` returns `Err` (and the `unwrap` panics)
+            // while any spawned task still holds its clone of `counts`. This line
+            // presumably relies on every task having finished by now — confirm
+            // against the `execute_scoped` implementation; with a plain
+            // `rayon::Scope`, spawned tasks are only guaranteed to be complete
+            // after the scope closure has returned.
+            Arc::try_unwrap(counts).unwrap().into_inner().unwrap()
+        })
+        .await?;
+
+    let total_tokens: usize = token_counts.iter().sum();
+    println!(
+        "Tokenized {} texts, total tokens: {}",
+        texts_len, total_tokens
+    );
+    println!("Time: {:?}", start.elapsed());
+
+    Ok(())
+}
+
+/// Example 5: Hierarchical computation
+///
+/// Four first-level tasks are spawned on the scope provided by
+/// `execute_scoped`. Each one computes an intermediate value, fans out into
+/// two second-level tasks, and stores the sum of their results in its slot
+/// of `phase1_results`. The scope closure returns the sum of all slots.
+async fn example_hierarchical(pool: &ComputePool) -> Result<()> {
+    println!("\n=== Example 5: Hierarchical Computation ===");
+
+    let start = Instant::now();
+
+    let result = pool
+        .execute_scoped(move |scope| {
+            let phase1_results = Arc::new(Mutex::new(vec![0u64; 4]));
+
+            // First level: compute initial values
+            for i in 0..4 {
+                let phase1_results = phase1_results.clone();
+                scope.spawn(move |_| {
+                    let intermediate = expensive_computation((i + 1) as u64 * 1000);
+
+                    // Second level: further process each result.
+                    //
+                    // The children run in their own nested `rayon::scope`, which does
+                    // not return until every task spawned on it has finished. Spawning
+                    // them on the outer scope instead would let the sum below be taken
+                    // before the children had written their results.
+                    let phase2_results = Mutex::new(vec![0u64; 2]);
+                    rayon::scope(|s2| {
+                        for j in 0..2 {
+                            let value = intermediate + (j as u64 * 100);
+                            // The nested scope may borrow locals, so no `Arc` is needed.
+                            let phase2_results = &phase2_results;
+                            s2.spawn(move |_| {
+                                let result = expensive_computation(value);
+                                phase2_results.lock().unwrap()[j] = result;
+                            });
+                        }
+                    });
+
+                    // All children are done here, so the mutex can simply be consumed.
+                    let sum: u64 = phase2_results.into_inner().unwrap().iter().sum();
+                    let mut p1 = phase1_results.lock().unwrap();
+                    p1[i] = sum;
+                });
+            }
+
+            // NOTE(review): this read presumably relies on all first-level tasks
+            // having finished — confirm against the `execute_scoped`
+            // implementation; with a plain `rayon::Scope`, spawned tasks are only
+            // guaranteed to be complete after the scope closure has returned.
+            phase1_results.lock().unwrap().iter().sum::<u64>()
+        })
+        .await?;
+
+    println!("Hierarchical computation result: {}", result);
+    println!("Time: {:?}", start.elapsed());
+
+    Ok(())
+}
+
+/// Entry point: initialises logging, builds a `Worker` from settings, fetches
+/// the runtime's compute pool and runs the five examples in order before
+/// printing the pool metrics.
+///
+/// Fails with "Compute pool not initialized" when the runtime exposes no pool.
+#[tokio::main]
+async fn main() -> Result<()> {
+    // Initialize logging
+    tracing_subscriber::fmt()
+        .with_max_level(tracing::Level::INFO)
+        .init();
+
+    // Create worker and runtime
+    let worker = Worker::from_settings()?;
+    let runtime = worker.runtime().clone();
+
+    // Get compute pool
+    let pool = runtime
+        .compute_pool()
+        .ok_or_else(|| anyhow::anyhow!("Compute pool not initialized"))?
+        .clone();
+
+    println!(
+        "Compute pool initialized with {} threads",
+        pool.num_threads()
+    );
+
+    // Run examples (sequentially; the first failure aborts the rest)
+    example_fork_join(&pool).await?;
+    example_scope(&pool).await?;
+    example_parallel_map(&pool).await?;
+    example_tokenization(&pool).await?;
+    example_hierarchical(&pool).await?;
+
+    // Print metrics
+    let metrics = pool.metrics();
+    println!("\n=== Compute Pool Metrics ===");
+    println!("{}", metrics);
+
+    Ok(())
+}
--- a/lib/runtime/examples/test_macros.rs
+++ b/lib/runtime/examples/test_macros.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use anyhow::Result;
+use dynamo_runtime::compute::{ComputeConfig, ComputePool};
+use dynamo_runtime::{compute_large, compute_medium, compute_small};
+use std::sync::Arc;
+
+/// Smoke test for the three compute macros against a locally built
+/// four-thread `ComputePool`.
+///
+/// `compute_medium!` and `compute_large!` are used without a trailing
+/// `.await`: their expansions already contain `.await?`, which is why this
+/// function has to be `async` and return a `Result`.
+#[tokio::main]
+async fn main() -> Result<()> {
+    println!("Testing compute macros...\n");
+
+    // Create compute pool
+    let compute_config = ComputeConfig {
+        num_threads: Some(4),
+        stack_size: Some(2 * 1024 * 1024),
+        thread_prefix: "test".to_string(),
+        pin_threads: false,
+    };
+    let pool = Arc::new(ComputePool::new(compute_config)?);
+
+    // Test small macro (direct execution)
+    println!("Testing compute_small!...");
+    let result = compute_small!(2 + 2);
+    println!("  Result: {}", result);
+
+    // Test medium macro (block_in_place with fallback)
+    println!("\nTesting compute_medium!...");
+    let result = compute_medium!(pool, {
+        let mut sum = 0u64;
+        for i in 0..1000 {
+            sum += i;
+        }
+        sum
+    });
+    println!("  Result: {}", result);
+
+    // Test large macro (always Rayon)
+    println!("\nTesting compute_large!...");
+    let result = compute_large!(pool, {
+        let mut sum = 0u64;
+        for i in 0..1_000_000 {
+            sum += i;
+        }
+        sum
+    });
+    println!("  Result: {}", result);
+
+    println!("\n All macros working!");
+    Ok(())
+}
--- a/lib/runtime/examples/tokenizer_integration.rs
+++ b/lib/runtime/examples/tokenizer_integration.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Example showing how to integrate ComputePool with tokenization workloads
+//!
+//! This demonstrates the pattern that could be used in lib/llm/src/preprocessor.rs
+//! to leverage the compute pool for batch tokenization operations.
+
+use anyhow::Result;
+use dynamo_runtime::{Worker, compute::ComputePool};
+use std::sync::{Arc, Mutex};
+use std::time::Instant;
+
+/// Stand-in tokenizer used by the examples; it carries no state.
+struct MockTokenizer;
+
+impl MockTokenizer {
+    /// Produces one pseudo-token per whitespace-separated word.
+    ///
+    /// Each token is a wrapping base-31 hash of the word's bytes, offset
+    /// (also wrapping) by the word's position within `text`.
+    fn encode(&self, text: &str) -> Vec<u32> {
+        text.split_whitespace()
+            .enumerate()
+            .map(|(position, word)| {
+                let mut hash = 0u32;
+                for byte in word.bytes() {
+                    hash = hash.wrapping_mul(31).wrapping_add(u32::from(byte));
+                }
+                hash.wrapping_add(position as u32)
+            })
+            .collect()
+    }
+
+    /// Renders every token as `token_<id mod 1000>`, joined by single spaces.
+    fn decode(&self, tokens: &[u32]) -> String {
+        let mut pieces = Vec::with_capacity(tokens.len());
+        for token in tokens {
+            pieces.push(format!("token_{}", token % 1000));
+        }
+        pieces.join(" ")
+    }
+}
+
+/// Pattern 1: Direct replacement for par_iter in preprocessor
+///
+/// This shows how the existing code in lib/llm/src/preprocessor.rs:330
+/// could be enhanced with explicit compute pool control
+///
+/// One scope task is spawned per text; each encodes its text and stores the
+/// tokens at the text's index, so the returned batches are in input order.
+async fn tokenize_batch_with_pool(
+    pool: &ComputePool,
+    tokenizer: Arc<MockTokenizer>,
+    texts: Vec<String>,
+) -> Result<Vec<Vec<u32>>> {
+    println!(
+        "\n=== Tokenizing {} texts with compute pool ===",
+        texts.len()
+    );
+    let start = Instant::now();
+
+    // Option 1: Using scope for fine control
+    let token_batches = pool
+        .execute_scoped(move |scope| {
+            // One empty slot per text, filled in by index from the spawned tasks.
+            let results = Arc::new(Mutex::new(vec![Vec::new(); texts.len()]));
+
+            for (i, text) in texts.iter().enumerate() {
+                let tokenizer = tokenizer.clone();
+                let text = text.clone();
+                let results = results.clone();
+                scope.spawn(move |_| {
+                    let tokens = tokenizer.encode(&text);
+                    let mut r = results.lock().unwrap();
+                    r[i] = tokens;
+                });
+            }
+
+            // NOTE(review): `Arc::try_unwrap` returns `Err` (and the `unwrap` panics)
+            // while any spawned task still holds its clone of `results`. This line
+            // presumably relies on every task having finished by now — confirm
+            // against the `execute_scoped` implementation; with a plain
+            // `rayon::Scope`, spawned tasks are only guaranteed to be complete
+            // after the scope closure has returned.
+            Arc::try_unwrap(results).unwrap().into_inner().unwrap()
+        })
+        .await?;
+
+    let total_tokens: usize = token_batches.iter().map(|v| v.len()).sum();
+    println!(
+        "Tokenized in {:?}, total tokens: {}",
+        start.elapsed(),
+        total_tokens
+    );
+
+    Ok(token_batches)
+}
+
+/// Pattern 2: plain rayon `par_iter`, run through `pool.install`
+///
+/// Keeps the `par_iter().map(..).collect()` shape of existing code; only the
+/// surrounding `install` call is new.
+async fn tokenize_batch_par_iter(
+    pool: &ComputePool,
+    tokenizer: Arc<MockTokenizer>,
+    texts: Vec<String>,
+) -> Result<Vec<Vec<u32>>> {
+    use rayon::prelude::*;
+
+    println!("\n=== Tokenizing with par_iter in compute pool ===");
+    let timer = Instant::now();
+
+    // The closure takes ownership of both the texts and the tokenizer handle.
+    let encode_all = move || -> Vec<Vec<u32>> {
+        texts
+            .par_iter()
+            .map(|text| tokenizer.encode(text))
+            .collect()
+    };
+    let token_batches: Vec<Vec<u32>> = pool.install(encode_all).await?;
+
+    let total_tokens = token_batches.iter().map(Vec::len).sum::<usize>();
+    println!(
+        "Tokenized in {:?}, total tokens: {}",
+        timer.elapsed(),
+        total_tokens
+    );
+
+    Ok(token_batches)
+}
+
+/// Pattern 3: alternating compute and async waits
+///
+/// Walks through three simulated requests one after another. Each request's
+/// batch goes through `tokenize_batch_with_pool`, followed by a 10ms async
+/// sleep standing in for I/O.
+async fn process_request_stream(pool: &ComputePool, tokenizer: Arc<MockTokenizer>) -> Result<()> {
+    println!("\n=== Processing request stream ===");
+
+    // Three simulated requests carrying 2, 3 and 1 texts respectively.
+    let payloads: [&[&str]; 3] = [
+        &["Request 1 text 1", "Request 1 text 2"],
+        &[
+            "Request 2 text 1",
+            "Request 2 text 2",
+            "Request 2 text 3",
+        ],
+        &["Request 3 text 1"],
+    ];
+
+    for (request_no, payload) in (1usize..).zip(payloads) {
+        println!("Processing request {}", request_no);
+
+        // Tokenize this request's batch on the compute pool.
+        let batch: Vec<String> = payload.iter().map(|text| text.to_string()).collect();
+        let tokens = tokenize_batch_with_pool(pool, tokenizer.clone(), batch).await?;
+
+        // Stand-in for async I/O between requests.
+        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
+
+        println!(
+            "Request {} completed with {} token batches",
+            request_no,
+            tokens.len()
+        );
+    }
+
+    Ok(())
+}
+
+/// Pattern 4: Encode/Decode pipeline
+///
+/// Shows how to chain multiple compute operations
+///
+/// Stage 1 encodes every text with one scope task per text; stage 2 decodes
+/// every token sequence the same way. Results are stored by index, so the
+/// returned strings are in the same order as `texts`.
+async fn encode_decode_pipeline(
+    pool: &ComputePool,
+    tokenizer: Arc<MockTokenizer>,
+    texts: Vec<String>,
+) -> Result<Vec<String>> {
+    println!("\n=== Encode/Decode Pipeline ===");
+    let start = Instant::now();
+
+    // Step 1: Encode all texts in parallel
+    // The clone goes into the encode closure; `tokenizer` itself is kept for step 2.
+    let tokenizer_clone = tokenizer.clone();
+    let encoded = pool
+        .execute_scoped(move |scope| {
+            let results = Arc::new(Mutex::new(vec![Vec::new(); texts.len()]));
+
+            for (i, text) in texts.iter().enumerate() {
+                let tokenizer = tokenizer_clone.clone();
+                let text = text.clone();
+                let results = results.clone();
+                scope.spawn(move |_| {
+                    let tokens = tokenizer.encode(&text);
+                    let mut r = results.lock().unwrap();
+                    r[i] = tokens;
+                });
+            }
+
+            // NOTE(review): `Arc::try_unwrap` returns `Err` (and the `unwrap` panics)
+            // while any spawned task still holds its clone of `results`. This line
+            // presumably relies on every task having finished by now — confirm
+            // against the `execute_scoped` implementation; with a plain
+            // `rayon::Scope`, spawned tasks are only guaranteed to be complete
+            // after the scope closure has returned.
+            Arc::try_unwrap(results).unwrap().into_inner().unwrap()
+        })
+        .await?;
+
+    println!("Encoding complete in {:?}", start.elapsed());
+
+    // Step 2: Decode all token sequences in parallel
+    let decoded_start = Instant::now();
+    let decoded = pool
+        .execute_scoped(move |scope| {
+            let results = Arc::new(Mutex::new(vec![String::new(); encoded.len()]));
+
+            for (i, tokens) in encoded.iter().enumerate() {
+                let tokenizer = tokenizer.clone();
+                let tokens = tokens.clone();
+                let results = results.clone();
+                scope.spawn(move |_| {
+                    let text = tokenizer.decode(&tokens);
+                    let mut r = results.lock().unwrap();
+                    r[i] = text;
+                });
+            }
+
+            // NOTE(review): same completion assumption as in step 1 — this
+            // `try_unwrap` panics if any decode task still holds `results`.
+            Arc::try_unwrap(results).unwrap().into_inner().unwrap()
+        })
+        .await?;
+
+    println!("Decoding complete in {:?}", decoded_start.elapsed());
+    println!("Total pipeline time: {:?}", start.elapsed());
+
+    Ok(decoded)
+}
+
+/// Async portion of the example: builds the worker, fetches the compute
+/// pool, runs every pattern once and prints the pool metrics.
+///
+/// Fails with "Compute pool not initialized" when the runtime exposes no pool.
+async fn run_examples() -> Result<()> {
+    // Create worker and runtime
+    let worker = Worker::from_settings()?;
+    let runtime = worker.runtime().clone();
+
+    // Get compute pool
+    let pool = runtime
+        .compute_pool()
+        .ok_or_else(|| anyhow::anyhow!("Compute pool not initialized"))?
+        .clone();
+
+    println!(
+        "Compute pool initialized with {} threads",
+        pool.num_threads()
+    );
+
+    // Create mock tokenizer
+    let tokenizer = Arc::new(MockTokenizer);
+
+    // Generate test data
+    let texts: Vec<String> = (0..50)
+        .map(|i| {
+            format!(
+                "This is sample text number {} with some words to tokenize. \
+             The quick brown fox jumps over the lazy dog.",
+                i
+            )
+        })
+        .collect();
+
+    // Run examples
+    let _ = tokenize_batch_with_pool(&pool, tokenizer.clone(), texts.clone()).await?;
+    let _ = tokenize_batch_par_iter(&pool, tokenizer.clone(), texts.clone()).await?;
+    process_request_stream(&pool, tokenizer.clone()).await?;
+    let decoded = encode_decode_pipeline(&pool, tokenizer.clone(), texts.clone()).await?;
+
+    println!("\n=== Results ===");
+    println!("Processed {} texts", texts.len());
+    println!("First decoded text: {}", &decoded[0]);
+
+    // Print metrics
+    let metrics = pool.metrics();
+    println!("\n=== Compute Pool Metrics ===");
+    println!("{}", metrics);
+
+    Ok(())
+}
+
+/// Entry point.
+///
+/// The environment is configured *before* the Tokio runtime is built.
+/// Under `#[tokio::main]` the runtime's worker threads already exist when
+/// the function body starts, and mutating the process environment while
+/// other threads are running violates the safety contract of
+/// `std::env::set_var`.
+fn main() -> Result<()> {
+    // Initialize logging
+    tracing_subscriber::fmt()
+        .with_max_level(tracing::Level::INFO)
+        .init();
+
+    // Set compute pool configuration via environment
+    // SAFETY: the Tokio runtime, the first thing here that starts threads, is
+    // only built further down, so nothing can read or write the environment
+    // concurrently with this call.
+    unsafe {
+        std::env::set_var("DYN_COMPUTE_THREADS", "4");
+    }
+
+    // Equivalent to what `#[tokio::main]` sets up, just after the env change.
+    tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()?
+        .block_on(run_examples())
+}
--- a/lib/runtime/src/compute/macros.rs
+++ b/lib/runtime/src/compute/macros.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Zero-overhead macros for compute task execution with optional validation
+//!
+//! These macros provide size-aware execution strategies:
+//! - `compute_small!`: Direct inline execution for tasks <100μs
+//! - `compute_medium!`: Semaphore-guarded block_in_place for tasks 100μs-1ms
+//! - `compute_large!`: Rayon offload for tasks >1ms
+//!
+//! When the `compute-validation` feature is enabled, these macros will
+//! time execution and emit warnings if tasks are misclassified.
+
+/// Execute a small compute task (<100μs) directly inline.
+///
+/// This macro has zero overhead and simply executes the expression directly.
+/// When validation is enabled, it will warn if the task takes >100μs.
+///
+/// The macro evaluates to the value of the expression; it is synchronous and
+/// needs neither a pool nor an `async` context.
+///
+/// NOTE(review): `#[cfg(feature = "compute-validation")]` inside a
+/// `macro_rules!` body is evaluated in the crate where the macro is expanded,
+/// so the timing branches are presumably controlled by the *calling* crate's
+/// feature set rather than this crate's — confirm this is intended.
+///
+/// # Example
+/// ```
+/// # use dynamo_runtime::compute_small;
+/// let result = compute_small!(2 + 2);
+/// assert_eq!(result, 4);
+/// ```
+#[macro_export]
+macro_rules! compute_small {
+    ($expr:expr) => {{
+        #[cfg(feature = "compute-validation")]
+        let _start = std::time::Instant::now();
+
+        let result = $expr; // Direct execution, zero overhead
+
+        #[cfg(feature = "compute-validation")]
+        $crate::compute::validation::validate_small(_start.elapsed());
+
+        result
+    }};
+}
+
+/// Execute a medium compute task (100μs-1ms) with intelligent scheduling.
+///
+/// Both forms first try to acquire a thread-local block permit and, on
+/// success, run the expression under `tokio::task::block_in_place`.
+/// Without a permit:
+/// - the one-argument form offloads to the thread-local pool if one is set,
+///   and otherwise logs a warning and runs the expression inline;
+/// - the two-argument form offloads to the pool passed by the caller.
+///
+/// The expansion already contains `.await?`. The macro must therefore be
+/// invoked inside an `async` context whose return type can absorb the error
+/// through `?`, and it evaluates directly to the task's value — do not append
+/// `.await` at the call site.
+///
+/// # Example
+/// ```ignore
+/// # use dynamo_runtime::{compute_medium, compute::ComputePool};
+/// # async fn example(pool: &ComputePool) -> anyhow::Result<()> {
+/// // With thread-local context (on worker thread)
+/// let result = compute_medium!({
+///     (0..1000).map(|i| i * 2).sum::<i32>()
+/// });
+///
+/// // Or with explicit pool (fallback)
+/// let result = compute_medium!(pool, {
+///     (0..1000).map(|i| i * 2).sum::<i32>()
+/// });
+/// # Ok(())
+/// # }
+/// ```
+#[macro_export]
+macro_rules! compute_medium {
+    // Thread-local version (no pool parameter)
+    ($expr:expr) => {{
+        #[cfg(feature = "compute-validation")]
+        let _start = std::time::Instant::now();
+
+        let result = async {
+            // Try thread-local context first
+            if let Ok(_permit) = $crate::compute::thread_local::try_acquire_block_permit() {
+                // Got permit - use block_in_place
+                Ok(tokio::task::block_in_place(|| {
+                    let r = $expr;
+                    drop(_permit); // Release ASAP
+                    r
+                }))
+            } else if let Some(pool) = $crate::compute::thread_local::get_pool() {
+                // No permit but have pool - offload
+                pool.execute(|| $expr).await
+            } else {
+                // No context available - fall back to inline execution
+                // This may block the async runtime but ensures the macro always works
+                tracing::warn!("compute_medium: No thread-local context, executing inline (may block async runtime)");
+                Ok($expr)
+            }
+        }
+        .await?;
+
+        #[cfg(feature = "compute-validation")]
+        $crate::compute::validation::validate_medium(_start.elapsed());
+
+        result
+    }};
+
+    // Explicit pool version (fallback)
+    ($pool:expr, $expr:expr) => {{
+        #[cfg(feature = "compute-validation")]
+        let _start = std::time::Instant::now();
+
+        let result = async {
+            // Try thread-local permits first, fall back to pool
+            if let Ok(_permit) = $crate::compute::thread_local::try_acquire_block_permit() {
+                // Got permit - use block_in_place
+                Ok(tokio::task::block_in_place(|| {
+                    let r = $expr;
+                    drop(_permit); // Release ASAP
+                    r
+                }))
+            } else {
+                // No permit available - offload to provided pool
+                $pool.execute(|| $expr).await
+            }
+        }
+        .await?;
+
+        #[cfg(feature = "compute-validation")]
+        $crate::compute::validation::validate_medium(_start.elapsed());
+
+        result
+    }};
+}
+
+/// Execute a large compute task (>1ms) on the Rayon thread pool.
+///
+/// The two-argument form always offloads to the given pool. The one-argument
+/// form offloads to the thread-local pool when one is set; otherwise it logs
+/// a warning and runs the expression inline on the calling thread.
+///
+/// The expansion already contains `.await?`. The macro must therefore be
+/// invoked inside an `async` context whose return type can absorb the error
+/// through `?`, and it evaluates directly to the task's value — do not append
+/// `.await` at the call site.
+///
+/// # Example
+/// ```ignore
+/// # use dynamo_runtime::{compute_large, compute::ComputePool};
+/// # async fn example(pool: &ComputePool) -> anyhow::Result<()> {
+/// // With thread-local context
+/// let result = compute_large!({
+///     expensive_matrix_multiplication()
+/// });
+///
+/// // Or with explicit pool
+/// let result = compute_large!(pool, {
+///     expensive_matrix_multiplication()
+/// });
+/// # Ok(())
+/// # }
+/// ```
+#[macro_export]
+macro_rules! compute_large {
+    // Thread-local version
+    ($expr:expr) => {{
+        #[cfg(feature = "compute-validation")]
+        let _start = std::time::Instant::now();
+
+        let result = async {
+            if let Some(pool) = $crate::compute::thread_local::get_pool() {
+                pool.execute(|| $expr).await
+            } else {
+                // No pool available - fall back to inline execution
+                // Warning: Large tasks inline will severely block the async runtime
+                tracing::warn!("compute_large: No thread-local context, executing inline (will block async runtime!)");
+                Ok($expr)
+            }
+        }
+        .await?;
+
+        #[cfg(feature = "compute-validation")]
+        $crate::compute::validation::validate_large(_start.elapsed());
+
+        result
+    }};
+
+    // Explicit pool version
+    ($pool:expr, $expr:expr) => {{
+        #[cfg(feature = "compute-validation")]
+        let _start = std::time::Instant::now();
+
+        let result = $pool.execute(|| $expr).await?;
+
+        #[cfg(feature = "compute-validation")]
+        $crate::compute::validation::validate_large(_start.elapsed());
+
+        result
+    }};
+}
--- a/lib/runtime/src/compute/metrics.rs
+++ b/lib/runtime/src/compute/metrics.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Metrics for monitoring compute pool operations
+
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+use std::time::Duration;
+
+/// Metrics for the compute pool
+///
+/// Every counter is an independent atomic updated with relaxed ordering, so
+/// each value is individually consistent but values read together may come
+/// from slightly different moments.
+#[derive(Debug)]
+pub struct ComputeMetrics {
+    /// Total number of tasks executed
+    tasks_total: AtomicU64,
+
+    /// Number of tasks currently running
+    tasks_active: AtomicUsize,
+
+    /// Total time spent in compute tasks (microseconds)
+    total_compute_time_us: AtomicU64,
+
+    /// Maximum task duration seen (microseconds)
+    max_task_duration_us: AtomicU64,
+
+    /// Number of tasks that took longer than 100ms
+    slow_tasks: AtomicU64,
+}
+
+impl ComputeMetrics {
+    /// A completed task strictly longer than this counts as slow.
+    const SLOW_TASK_THRESHOLD: Duration = Duration::from_millis(100);
+
+    /// Create new metrics instance
+    pub fn new() -> Self {
+        Self {
+            tasks_total: AtomicU64::new(0),
+            tasks_active: AtomicUsize::new(0),
+            total_compute_time_us: AtomicU64::new(0),
+            max_task_duration_us: AtomicU64::new(0),
+            slow_tasks: AtomicU64::new(0),
+        }
+    }
+
+    /// Record that a task has started
+    pub fn record_task_start(&self) {
+        self.tasks_active.fetch_add(1, Ordering::Relaxed);
+    }
+
+    /// Record that a task has completed
+    ///
+    /// `duration` is added to the running total, compared against the
+    /// maximum seen so far, and counted as slow when it exceeds 100ms.
+    pub fn record_task_completion(&self, duration: Duration) {
+        // Saturating decrement: a completion that arrives after `reset()` (or
+        // without a matching start) must not wrap the gauge to `usize::MAX`.
+        let _ = self
+            .tasks_active
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |active| {
+                active.checked_sub(1)
+            });
+        self.tasks_total.fetch_add(1, Ordering::Relaxed);
+
+        // Use saturating conversion to prevent overflow
+        let duration_us = u64::try_from(duration.as_micros()).unwrap_or(u64::MAX);
+        self.total_compute_time_us
+            .fetch_add(duration_us, Ordering::Relaxed);
+
+        // Update max duration
+        self.max_task_duration_us
+            .fetch_max(duration_us, Ordering::Relaxed);
+
+        // Track slow tasks (> 100ms). Comparing `Duration`s keeps sub-millisecond
+        // precision; `as_millis()` would truncate 100.9ms down to 100.
+        if duration > Self::SLOW_TASK_THRESHOLD {
+            self.slow_tasks.fetch_add(1, Ordering::Relaxed);
+        }
+    }
+
+    /// Get total number of tasks executed
+    pub fn tasks_total(&self) -> u64 {
+        self.tasks_total.load(Ordering::Relaxed)
+    }
+
+    /// Get number of currently active tasks
+    pub fn tasks_active(&self) -> usize {
+        self.tasks_active.load(Ordering::Relaxed)
+    }
+
+    /// Get average task duration in microseconds
+    ///
+    /// Returns `0.0` when no task has completed yet.
+    pub fn avg_task_duration_us(&self) -> f64 {
+        let total = self.tasks_total.load(Ordering::Relaxed);
+        if total == 0 {
+            return 0.0;
+        }
+
+        let total_time = self.total_compute_time_us.load(Ordering::Relaxed);
+        total_time as f64 / total as f64
+    }
+
+    /// Get maximum task duration in microseconds
+    pub fn max_task_duration_us(&self) -> u64 {
+        self.max_task_duration_us.load(Ordering::Relaxed)
+    }
+
+    /// Get number of slow tasks (> 100ms)
+    pub fn slow_tasks(&self) -> u64 {
+        self.slow_tasks.load(Ordering::Relaxed)
+    }
+
+    /// Reset all metrics
+    ///
+    /// The active-task gauge is zeroed as well; completions of tasks that
+    /// were in flight at reset time leave it at zero rather than wrapping.
+    pub fn reset(&self) {
+        self.tasks_total.store(0, Ordering::Relaxed);
+        self.tasks_active.store(0, Ordering::Relaxed);
+        self.total_compute_time_us.store(0, Ordering::Relaxed);
+        self.max_task_duration_us.store(0, Ordering::Relaxed);
+        self.slow_tasks.store(0, Ordering::Relaxed);
+    }
+}
+
+/// `Default` delegates to [`ComputeMetrics::new`], i.e. all counters start at zero.
+impl Default for ComputeMetrics {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Single-line, human-readable rendering of the metrics (durations in milliseconds)
+impl std::fmt::Display for ComputeMetrics {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Snapshot each counter once; durations are stored in microseconds
+        // and converted to milliseconds for display.
+        let total = self.tasks_total();
+        let active = self.tasks_active();
+        let avg_ms = self.avg_task_duration_us() / 1000.0;
+        let max_ms = self.max_task_duration_us() as f64 / 1000.0;
+        let slow = self.slow_tasks();
+
+        write!(
+            f,
+            "ComputeMetrics {{ tasks_total: {}, tasks_active: {}, avg_duration_ms: {:.2}, max_duration_ms: {:.2}, slow_tasks: {} }}",
+            total, active, avg_ms, max_ms, slow,
+        )
+    }
+}
+
+// Unit tests for `ComputeMetrics`: counter bookkeeping and `reset`.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Start/completion pairs update the active gauge, the total, and the
+    /// slow-task counter (50ms is not slow, 150ms is).
+    #[test]
+    fn test_metrics_recording() {
+        let metrics = ComputeMetrics::new();
+
+        assert_eq!(metrics.tasks_total(), 0);
+        assert_eq!(metrics.tasks_active(), 0);
+
+        metrics.record_task_start();
+        assert_eq!(metrics.tasks_active(), 1);
+
+        metrics.record_task_completion(Duration::from_millis(50));
+        assert_eq!(metrics.tasks_active(), 0);
+        assert_eq!(metrics.tasks_total(), 1);
+        assert_eq!(metrics.slow_tasks(), 0);
+
+        metrics.record_task_start();
+        metrics.record_task_completion(Duration::from_millis(150));
+        assert_eq!(metrics.tasks_total(), 2);
+        assert_eq!(metrics.slow_tasks(), 1);
+    }
+
+    /// `reset` returns the total and the active gauge to zero.
+    #[test]
+    fn test_metrics_reset() {
+        let metrics = ComputeMetrics::new();
+
+        metrics.record_task_start();
+        metrics.record_task_completion(Duration::from_millis(50));
+        assert_eq!(metrics.tasks_total(), 1);
+
+        metrics.reset();
+        assert_eq!(metrics.tasks_total(), 0);
+        assert_eq!(metrics.tasks_active(), 0);
+    }
+}
--- a/lib/runtime/src/compute/mod.rs
+++ b/lib/runtime/src/compute/mod.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Compute module for CPU-intensive operations using Rayon
+//!
+//! This module provides a dedicated compute thread pool for CPU-bound work,
+//! integrating Rayon's fork-join parallelism with Tokio's async runtime.
+//!
+//! Key features:
+//! - Dedicated Rayon thread pool for compute operations
+//! - Seamless async-to-sync bridging via tokio-rayon
+//! - Scope-based parallelism for complex computational graphs
+//! - Metrics and monitoring for compute operations
+//!
+#![doc = include_str!("../../docs/rayon-tokio-strategy.md")]
+
+use anyhow::Result;
+use rayon::ThreadPoolBuilder;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::time::Instant;
+
+pub mod macros;
+pub mod metrics;
+pub mod pool;
+pub mod thread_local;
+#[cfg(feature = "compute-validation")]
+pub mod validation;
+
+pub use metrics::ComputeMetrics;
+pub use pool::{ComputeHandle, ComputePool, ComputePoolExt};
+
+/// Configuration for the compute thread pool
+///
+/// Checked by `validate` and turned into a Rayon pool by `build_pool`.
+#[derive(Debug, Clone)]
+pub struct ComputeConfig {
+    /// Number of threads in the Rayon pool
+    ///
+    /// When `None`, `build_pool` uses half of the available parallelism,
+    /// clamped to the range 2..=16, or 2 if detection fails. `Some(0)` is
+    /// rejected by `validate`.
+    pub num_threads: Option<usize>,
+
+    /// Stack size for compute threads in bytes
+    ///
+    /// `Default` sets this to 2MB. When `None`, `build_pool` does not set a
+    /// stack size on the Rayon builder. Values below 128KB are rejected by
+    /// `validate`.
+    pub stack_size: Option<usize>,
+
+    /// Thread name prefix (defaults to "compute")
+    ///
+    /// Pool threads are named `{prefix}-{n}`.
+    pub thread_prefix: String,
+
+    /// Whether to pin threads to CPU cores
+    ///
+    /// Not implemented yet: `build_pool` currently ignores this flag.
+    pub pin_threads: bool,
+}
+
+impl Default for ComputeConfig {
+    /// Defaults: automatic thread count, 2MB stacks, a `compute` thread-name
+    /// prefix, and no CPU pinning.
+    fn default() -> Self {
+        const DEFAULT_STACK_BYTES: usize = 2 * 1024 * 1024;
+
+        ComputeConfig {
+            // `None` leaves the thread count for `build_pool` to choose
+            num_threads: None,
+            stack_size: Some(DEFAULT_STACK_BYTES),
+            thread_prefix: String::from("compute"),
+            pin_threads: false,
+        }
+    }
+}
+
+impl ComputeConfig {
+    /// Validate the configuration
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when `num_threads` is `Some(0)` or when `stack_size`
+    /// is set to less than 128KB.
+    pub fn validate(&self) -> Result<()> {
+        const MIN_STACK_SIZE: usize = 128 * 1024;
+
+        // `None` selects a default thread count in `build_pool`; only an
+        // explicit zero is invalid.
+        if self.num_threads == Some(0) {
+            return Err(anyhow::anyhow!(
+                "Number of compute threads cannot be 0. Use None to select the default thread count."
+            ));
+        }
+
+        if let Some(stack_size) = self.stack_size.filter(|&size| size < MIN_STACK_SIZE) {
+            return Err(anyhow::anyhow!(
+                "Stack size too small: {}KB. Minimum recommended: 128KB",
+                stack_size / 1024
+            ));
+        }
+
+        Ok(())
+    }
+
+    /// Build a Rayon thread pool from this configuration
+    ///
+    /// The configuration is validated first. When `num_threads` is `None`, the
+    /// pool uses half of the available parallelism, clamped to 2..=16, or 2
+    /// threads if the parallelism cannot be detected.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if validation fails or Rayon cannot create the pool.
+    pub(crate) fn build_pool(&self) -> Result<rayon::ThreadPool> {
+        self.validate()?;
+
+        // At least 2 threads for meaningful parallelism, at most 16 for efficiency
+        let num_threads = self.num_threads.unwrap_or_else(|| {
+            std::thread::available_parallelism()
+                .map(|n| (n.get() / 2).clamp(2, 16))
+                .unwrap_or(2)
+        });
+        let mut builder = ThreadPoolBuilder::new().num_threads(num_threads);
+
+        // Set stack size if specified
+        if let Some(stack_size) = self.stack_size {
+            builder = builder.stack_size(stack_size);
+        }
+
+        // Name threads `{prefix}-{n}`, numbering them in the order names are requested
+        let prefix = self.thread_prefix.clone();
+        let thread_counter = Arc::new(AtomicU64::new(0));
+        builder = builder.thread_name(move |_| {
+            let id = thread_counter.fetch_add(1, Ordering::SeqCst);
+            format!("{}-{}", prefix, id)
+        });
+
+        // TODO: honor `pin_threads` (e.g. via a start handler that pins each
+        // thread to a CPU core); the flag is currently ignored.
+
+        builder
+            .build()
+            .map_err(|e| anyhow::anyhow!("Failed to create Rayon thread pool: {}", e))
+    }
+}
+
+/// Helper trait for scope-based operations
+///
+/// NOTE(review): no implementor is visible in this module — confirm where
+/// this trait is implemented and used.
+pub trait ScopeExecutor {
+    /// Execute a function within a Rayon scope
+    ///
+    /// `f` receives a reference to the `rayon::Scope`, and the value it
+    /// returns is the return value of this method.
+    fn execute_in_scope<F, R>(&self, f: F) -> R
+    where
+        F: FnOnce(&rayon::Scope) -> R + Send,
+        R: Send;
+}
+
+/// Free-function helpers for common parallel patterns on a `ComputePool`
+pub mod patterns {
+    use super::*;
+
+    /// Run `f1` and `f2` as a `rayon::join` pair on `pool` and return both results
+    pub async fn parallel_join<F1, F2, R1, R2>(
+        pool: &ComputePool,
+        f1: F1,
+        f2: F2,
+    ) -> Result<(R1, R2)>
+    where
+        F1: FnOnce() -> R1 + Send + 'static,
+        F2: FnOnce() -> R2 + Send + 'static,
+        R1: Send + 'static,
+        R2: Send + 'static,
+    {
+        let joined = move || rayon::join(f1, f2);
+        pool.execute(joined).await
+    }
+
+    /// Apply `f` to every item on `pool` with a Rayon parallel iterator and
+    /// collect the results into a `Vec`
+    pub async fn parallel_map<F, T, R>(pool: &ComputePool, items: Vec<T>, f: F) -> Result<Vec<R>>
+    where
+        F: Fn(T) -> R + Sync + Send + 'static,
+        T: Send + 'static,
+        R: Send + 'static,
+    {
+        use rayon::prelude::*;
+
+        let mapped = move || items.into_par_iter().map(f).collect::<Vec<R>>();
+        pool.execute(mapped).await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Checks the values produced by `ComputeConfig::default()`.
+    #[test]
+    fn test_compute_config_default() {
+        let config = ComputeConfig::default();
+        assert_eq!(config.thread_prefix, "compute");
+        assert_eq!(config.stack_size, Some(2 * 1024 * 1024));
+        assert!(!config.pin_threads);
+    }
+
+    /// Checks that an explicit thread count is honored by the built pool.
+    #[test]
+    fn test_build_pool() {
+        // Override only the thread count; everything else stays at its default
+        let config = ComputeConfig {
+            num_threads: Some(2),
+            ..Default::default()
+        };
+
+        let pool = config.build_pool().unwrap();
+        assert_eq!(pool.current_num_threads(), 2);
+    }
+}
--- a/lib/runtime/src/compute/pool.rs
+++ b/lib/runtime/src/compute/pool.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Compute pool implementation with tokio-rayon integration
+//!
+//! The `ComputePool` allows multiple async tasks to concurrently submit different
+//! types of parallel work to a shared Rayon thread pool. This enables efficient
+//! CPU utilization without manual thread management.
+//!
+//! # Concurrent Usage Example
+//!
+//! ```ignore
+//! use std::sync::Arc;
+//! use dynamo_runtime::compute::ComputePool;
+//! use rayon::prelude::*;
+//!
+//! async fn concurrent_processing(pool: Arc<ComputePool>) {
+//!     // Task 1: Using scope for dynamic task generation
+//!     let task1 = tokio::spawn({
+//!         let pool = pool.clone();
+//!         async move {
+//!             pool.execute_scoped(|scope| {
+//!                 // Dynamically spawn tasks based on runtime conditions
+//!                 for i in 0..100 {
+//!                     scope.spawn(move |_| {
+//!                         // CPU-intensive work
+//!                         let mut sum = 0u64;
+//!                         for j in 0..1000 {
+//!                             sum += (i * j) as u64;
+//!                         }
+//!                         std::hint::black_box(sum);
+//!                     });
+//!                 }
+//!             }).await
+//!         }
+//!     });
+//!
+//!     // Task 2: Using parallel iterators for batch processing
+//!     let task2 = tokio::spawn({
+//!         let pool = pool.clone();
+//!         async move {
+//!             let data: Vec<u32> = (0..10000).collect();
+//!             pool.install(|| {
+//!                 data.par_chunks(100)
+//!                     .map(|chunk| chunk.iter().sum::<u32>())
+//!                     .collect::<Vec<_>>()
+//!             }).await
+//!         }
+//!     });
+//!
+//!     // Both tasks run concurrently, sharing the same thread pool
+//!     let (result1, result2) = tokio::join!(task1, task2);
+//! }
+//! ```
+//!
+//! # Thread Pool Sharing
+//!
+//! The Rayon thread pool uses work-stealing to efficiently distribute work from
+//! multiple concurrent sources:
+//!
+//! - Tasks from `scope.spawn()` are pushed to thread-local deques
+//! - Parallel iterators distribute work across all threads
+//! - Idle threads steal work from busy threads
+//! - No coordination needed between different parallelization patterns
+
+use super::{ComputeConfig, ComputeMetrics};
+use anyhow::Result;
+use async_trait::async_trait;
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+/// A compute pool that manages CPU-intensive operations
+///
+/// Cloning is cheap with respect to the pool and metrics: both live behind an
+/// `Arc`, so every clone shares the same Rayon threads and the same counters.
+/// The configuration is cloned by value.
+#[derive(Clone)]
+pub struct ComputePool {
+    /// The underlying Rayon thread pool
+    pool: Arc<rayon::ThreadPool>,
+
+    /// Metrics for monitoring compute operations
+    metrics: Arc<ComputeMetrics>,
+
+    /// Configuration used to create this pool
+    config: ComputeConfig,
+}
+
+impl std::fmt::Debug for ComputePool {
+    /// Prints the current thread count together with the metrics and config;
+    /// the Rayon pool itself is not printed.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let num_threads = self.pool.current_num_threads();
+
+        let mut out = f.debug_struct("ComputePool");
+        out.field("num_threads", &num_threads);
+        out.field("metrics", &self.metrics);
+        out.field("config", &self.config);
+        out.finish()
+    }
+}
+
+impl ComputePool {
+    /// Create a new compute pool with the given configuration
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the configuration fails validation or the Rayon
+    /// thread pool cannot be built.
+    pub fn new(config: ComputeConfig) -> Result<Self> {
+        let pool = Arc::new(config.build_pool()?);
+        let metrics = Arc::new(ComputeMetrics::new());
+
+        Ok(Self {
+            pool,
+            metrics,
+            config,
+        })
+    }
+
+    /// Create a compute pool with default configuration
+    pub fn with_defaults() -> Result<Self> {
+        Self::new(ComputeConfig::default())
+    }
+
+    /// Run `f` on this pool from async context while recording metrics
+    ///
+    /// Shared implementation behind `execute`, `execute_scoped`,
+    /// `execute_scoped_fifo`, and `install`. The task is counted as started
+    /// before it is handed to Rayon. Its completion is recorded by a guard
+    /// owned by the submitted closure, so the active-task count is released
+    /// when the work actually ends, including when `f` panics or when the
+    /// caller stops awaiting the returned future.
+    async fn run_tracked<F, R>(&self, f: F) -> Result<R>
+    where
+        F: FnOnce() -> R + Send + 'static,
+        R: Send + 'static,
+    {
+        /// Records task completion when dropped
+        struct CompletionGuard {
+            metrics: Arc<ComputeMetrics>,
+            started: std::time::Instant,
+        }
+
+        impl Drop for CompletionGuard {
+            fn drop(&mut self) {
+                self.metrics.record_task_completion(self.started.elapsed());
+            }
+        }
+
+        self.metrics.record_task_start();
+        let guard = CompletionGuard {
+            metrics: Arc::clone(&self.metrics),
+            started: std::time::Instant::now(),
+        };
+
+        // Use tokio-rayon to bridge to the compute pool
+        let pool = Arc::clone(&self.pool);
+        let result = tokio_rayon::spawn(move || {
+            // Dropped when this closure ends, whether it returns or unwinds
+            let _guard = guard;
+            pool.install(f)
+        })
+        .await;
+
+        Ok(result)
+    }
+
+    /// Execute a synchronous computation on the thread pool
+    ///
+    /// This method is designed to be called from within `spawn_blocking` or other
+    /// synchronous contexts. It has minimal overhead as it directly uses Rayon
+    /// without the async bridge. The calling thread blocks until `f` returns,
+    /// and no metrics are recorded for the call.
+    ///
+    /// # Example
+    /// ```ignore
+    /// # use dynamo_runtime::compute::ComputePool;
+    /// # let pool = ComputePool::new(Default::default()).unwrap();
+    /// tokio::task::spawn_blocking(move || {
+    ///     pool.execute_sync(|| {
+    ///         // CPU-intensive work
+    ///         expensive_computation()
+    ///     })
+    /// });
+    /// ```
+    pub fn execute_sync<F, R>(&self, f: F) -> R
+    where
+        F: FnOnce() -> R + Send,
+        R: Send,
+    {
+        self.pool.install(f)
+    }
+
+    /// Execute a compute task in the Rayon pool
+    ///
+    /// This bridges from async context to the Rayon thread pool,
+    /// allowing CPU-intensive work to run without blocking Tokio workers.
+    /// The returned `Result` is currently always `Ok`.
+    ///
+    /// Note: This method has ~25μs overhead for small tasks due to the async
+    /// channel communication. For very small computations (<100μs), consider
+    /// running directly on Tokio or using `spawn_blocking` with `execute_sync`.
+    pub async fn execute<F, R>(&self, f: F) -> Result<R>
+    where
+        F: FnOnce() -> R + Send + 'static,
+        R: Send + 'static,
+    {
+        self.run_tracked(f).await
+    }
+
+    /// Execute a function with a Rayon scope
+    ///
+    /// This allows spawning multiple parallel tasks within the scope,
+    /// with the guarantee that all tasks complete before returning.
+    pub async fn execute_scoped<F, R>(&self, f: F) -> Result<R>
+    where
+        F: FnOnce(&rayon::Scope) -> R + Send + 'static,
+        R: Send + 'static,
+    {
+        // `rayon::scope` hands back the closure's return value once the scope
+        // has ended, so no intermediate `Option` is needed.
+        self.run_tracked(move || rayon::scope(f)).await
+    }
+
+    /// Execute a function with a FIFO scope
+    ///
+    /// Similar to execute_scoped, but tasks are prioritized in FIFO order
+    /// rather than the default LIFO order.
+    pub async fn execute_scoped_fifo<F, R>(&self, f: F) -> Result<R>
+    where
+        F: FnOnce(&rayon::ScopeFifo) -> R + Send + 'static,
+        R: Send + 'static,
+    {
+        self.run_tracked(move || rayon::scope_fifo(f)).await
+    }
+
+    /// Join two computations in parallel
+    ///
+    /// Runs `f1` and `f2` through `rayon::join` on this pool and returns both
+    /// results as a tuple.
+    pub async fn join<F1, F2, R1, R2>(&self, f1: F1, f2: F2) -> Result<(R1, R2)>
+    where
+        F1: FnOnce() -> R1 + Send + 'static,
+        F2: FnOnce() -> R2 + Send + 'static,
+        R1: Send + 'static,
+        R2: Send + 'static,
+    {
+        self.execute(move || rayon::join(f1, f2)).await
+    }
+
+    /// Get metrics for this compute pool
+    pub fn metrics(&self) -> &ComputeMetrics {
+        &self.metrics
+    }
+
+    /// Get the number of threads in the pool
+    pub fn num_threads(&self) -> usize {
+        self.pool.current_num_threads()
+    }
+
+    /// Install this pool as the Rayon pool for the given closure
+    ///
+    /// This method is essential for using Rayon's parallel iterators (like `par_iter`,
+    /// `par_chunks`, etc.) with this specific thread pool. Any parallel iterator
+    /// operations within the closure will execute on this pool's threads.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// use rayon::prelude::*;
+    ///
+    /// // Process data using parallel iterators
+    /// let result = pool.install(|| {
+    ///     data.par_chunks(100)
+    ///         .map(|chunk| process_chunk(chunk))
+    ///         .collect::<Vec<_>>()
+    /// }).await?;
+    /// ```
+    ///
+    /// # Concurrent Usage
+    ///
+    /// Multiple async tasks can call `install()` concurrently on the same pool.
+    /// The Rayon work-stealing scheduler will efficiently distribute work from
+    /// all concurrent operations:
+    ///
+    /// ```ignore
+    /// // These can run concurrently without interference
+    /// let task1 = pool.install(|| data1.par_iter().map(f1).collect());
+    /// let task2 = pool.install(|| data2.par_chunks(50).map(f2).collect());
+    /// ```
+    pub async fn install<F, R>(&self, f: F) -> Result<R>
+    where
+        F: FnOnce() -> R + Send + 'static,
+        R: Send + 'static,
+    {
+        self.run_tracked(f).await
+    }
+}
+
+/// A handle to a compute task that's currently running
+///
+/// Wraps a boxed, pinned future and resolves to that future's output when
+/// awaited.
+pub struct ComputeHandle<T> {
+    /// The wrapped future whose output this handle resolves to
+    inner: Pin<Box<dyn Future<Output = T> + Send>>,
+}
+
+impl<T> ComputeHandle<T> {
+    /// Wrap `future` in a handle by boxing and pinning it
+    pub(crate) fn new<F>(future: F) -> Self
+    where
+        F: Future<Output = T> + Send + 'static,
+    {
+        let inner: Pin<Box<dyn Future<Output = T> + Send>> = Box::pin(future);
+        Self { inner }
+    }
+}
+
+impl<T> Future for ComputeHandle<T> {
+    type Output = T;
+
+    /// Delegates polling to the boxed inner future
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let handle = self.get_mut();
+        handle.inner.as_mut().poll(cx)
+    }
+}
+
+/// Extension trait for ComputePool with additional patterns
+#[async_trait]
+pub trait ComputePoolExt {
+    /// Process items in parallel batches
+    ///
+    /// `f` is called with slices of at most `batch_size` items and the vectors
+    /// it returns are flattened into one result vector. `batch_size` must be
+    /// non-zero.
+    async fn parallel_batch<T, F, R>(
+        &self,
+        items: Vec<T>,
+        batch_size: usize,
+        f: F,
+    ) -> Result<Vec<R>>
+    where
+        T: Send + Sync + 'static,
+        F: Fn(&[T]) -> Vec<R> + Send + Sync + 'static,
+        R: Send + 'static;
+
+    /// Map over items in parallel using Rayon's par_iter
+    ///
+    /// Consumes `items`, applies `f` to each one, and collects the results
+    /// into a vector.
+    async fn parallel_map<T, F, R>(&self, items: Vec<T>, f: F) -> Result<Vec<R>>
+    where
+        T: Send + Sync + 'static,
+        F: Fn(T) -> R + Send + Sync + 'static,
+        R: Send + 'static;
+}
+
+#[async_trait]
+impl ComputePoolExt for ComputePool {
+    /// Split `items` into chunks of at most `batch_size`, run `f` on each
+    /// chunk in parallel on this pool, and flatten the per-chunk vectors into
+    /// one result.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `batch_size` is 0.
+    async fn parallel_batch<T, F, R>(
+        &self,
+        items: Vec<T>,
+        batch_size: usize,
+        f: F,
+    ) -> Result<Vec<R>>
+    where
+        T: Send + Sync + 'static,
+        F: Fn(&[T]) -> Vec<R> + Send + Sync + 'static,
+        R: Send + 'static,
+    {
+        use rayon::prelude::*;
+
+        // `par_chunks` panics on a chunk size of zero; report it to the caller
+        // as an error instead of panicking inside the pool.
+        if batch_size == 0 {
+            return Err(anyhow::anyhow!("batch_size must be greater than 0"));
+        }
+
+        self.install(move || items.par_chunks(batch_size).flat_map(f).collect())
+            .await
+    }
+
+    /// Apply `f` to every item in parallel on this pool and collect the
+    /// results into a vector.
+    async fn parallel_map<T, F, R>(&self, items: Vec<T>, f: F) -> Result<Vec<R>>
+    where
+        T: Send + Sync + 'static,
+        F: Fn(T) -> R + Send + Sync + 'static,
+        R: Send + 'static,
+    {
+        use rayon::prelude::*;
+
+        self.install(move || items.into_par_iter().map(f).collect())
+            .await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// `execute` runs the closure on the pool and returns its value.
+    #[tokio::test]
+    async fn test_compute_pool_execute() {
+        let pool = ComputePool::with_defaults().unwrap();
+
+        let result = pool
+            .execute(|| {
+                // Simulate CPU-intensive work
+                let mut sum = 0u64;
+                for i in 0..1000 {
+                    sum += i;
+                }
+                sum
+            })
+            .await
+            .unwrap();
+
+        assert_eq!(result, 499500);
+    }
+
+    /// `join` returns the results of both closures as a tuple.
+    #[tokio::test]
+    async fn test_compute_pool_join() {
+        let pool = ComputePool::with_defaults().unwrap();
+
+        let (a, b) = pool.join(|| 2 + 2, || 3 * 3).await.unwrap();
+
+        assert_eq!(a, 4);
+        assert_eq!(b, 9);
+    }
+
+    /// `execute_sync` can be driven from a `spawn_blocking` task.
+    #[tokio::test]
+    async fn test_compute_pool_execute_sync() {
+        let pool = Arc::new(ComputePool::with_defaults().unwrap());
+
+        // Test using execute_sync from spawn_blocking
+        let pool_clone = pool.clone();
+        let result = tokio::task::spawn_blocking(move || {
+            pool_clone.execute_sync(|| {
+                let mut sum = 0u64;
+                for i in 0..1000 {
+                    sum += i;
+                }
+                sum
+            })
+        })
+        .await
+        .unwrap();
+
+        assert_eq!(result, 499500);
+    }
+
+    /// Tasks spawned in a scope each report `(index, index * 2)` over a channel.
+    #[tokio::test]
+    async fn test_compute_pool_scoped() {
+        use std::sync::mpsc;
+
+        let pool = ComputePool::with_defaults().unwrap();
+
+        let result = pool
+            .execute_scoped(|scope| {
+                let (tx, rx) = mpsc::channel();
+
+                for i in 0..4 {
+                    let tx = tx.clone();
+                    scope.spawn(move |_| {
+                        tx.send((i, i * 2)).unwrap();
+                    });
+                }
+
+                drop(tx); // Close sender so receiver can finish
+
+                // Messages may arrive in any order, but each carries its own
+                // index, so `results` ends up in index order.
+                let mut results = vec![0; 4];
+                for (i, val) in rx {
+                    results[i] = val;
+                }
+                results
+            })
+            .await
+            .unwrap();
+
+        assert_eq!(result, vec![0, 2, 4, 6]);
+    }
+}
--- a/lib/runtime/src/compute/thread_local.rs
+++ b/lib/runtime/src/compute/thread_local.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Thread-local storage for compute resources
+//!
+//! This module provides thread-local access to compute resources (Rayon pool and semaphore)
+//! for Tokio worker threads. This eliminates the need to pass Runtime or ComputePool
+//! references through async function calls.
+
+use super::ComputePool;
+use std::cell::RefCell;
+use std::sync::Arc;
+use tokio::sync::Semaphore;
+
+thread_local! {
+    /// Thread-local compute context available on Tokio worker threads
+    ///
+    /// Starts as `None` on every thread and is filled in by
+    /// `initialize_context` on the thread that calls it.
+    static COMPUTE_CONTEXT: RefCell<Option<ComputeContext>> = const { RefCell::new(None) };
+}
+
+/// Compute resources available to a Tokio worker thread
+///
+/// Both fields are `Arc`s, so cloning a context shares the same pool and the
+/// same semaphore.
+#[derive(Clone)]
+pub struct ComputeContext {
+    /// The Rayon compute pool
+    pub pool: Arc<ComputePool>,
+    /// Semaphore for block_in_place permits
+    pub block_in_place_permits: Arc<Semaphore>,
+}
+
+/// Install the compute context for the current thread
+///
+/// Any context previously stored on this thread is replaced. This should be
+/// called from the Tokio runtime's `on_thread_start` callback.
+pub fn initialize_context(pool: Arc<ComputePool>, permits: Arc<Semaphore>) {
+    let context = ComputeContext {
+        pool,
+        block_in_place_permits: permits,
+    };
+    COMPUTE_CONTEXT.with_borrow_mut(|slot| *slot = Some(context));
+}
+
+/// Run `f` against the current thread's compute context
+///
+/// Returns `Some` with the closure's result, or `None` when no context has
+/// been initialized on this thread (for example on a non-worker thread).
+pub fn with_context<F, R>(f: F) -> Option<R>
+where
+    F: FnOnce(&ComputeContext) -> R,
+{
+    COMPUTE_CONTEXT.with_borrow(|slot| slot.as_ref().map(f))
+}
+
+/// Try to take a block_in_place permit from the thread-local context
+///
+/// Never waits: returns `Ok(permit)` on success, and `Err` with a static
+/// message when this thread has no context or no permit could be acquired.
+pub fn try_acquire_block_permit() -> Result<tokio::sync::OwnedSemaphorePermit, &'static str> {
+    let attempt =
+        with_context(|ctx| Arc::clone(&ctx.block_in_place_permits).try_acquire_owned());
+
+    match attempt {
+        None => Err("No compute context on this thread"),
+        Some(Err(_)) => Err("No permits available"),
+        Some(Ok(permit)) => Ok(permit),
+    }
+}
+
+/// Fetch a shared handle to the compute pool from thread-local context
+///
+/// Returns `None` when no context has been initialized on this thread.
+pub fn get_pool() -> Option<Arc<ComputePool>> {
+    with_context(|ctx| Arc::clone(&ctx.pool))
+}
+
+/// Report whether the current thread has a compute context
+///
+/// Returns true once `initialize_context` has stored a context on this
+/// thread, and false otherwise.
+pub fn has_compute_context() -> bool {
+    COMPUTE_CONTEXT.with_borrow(|slot| slot.is_some())
+}
+
+/// Require that the current thread has a compute context
+///
+/// # Panics
+///
+/// Panics if the thread-local context has not been initialized on this thread.
+pub fn assert_compute_context() {
+    assert!(
+        has_compute_context(),
+        "Thread-local compute context not initialized! \
+         Compute macros will fall back to inline execution. \
+         Call Runtime::initialize_thread_local() on worker threads."
+    );
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Without a call to `initialize_context` on this thread, every accessor
+    /// reports that no context is available.
+    #[test]
+    fn test_uninitialized_context() {
+        // Should return None when context not initialized
+        assert!(get_pool().is_none());
+        assert!(try_acquire_block_permit().is_err());
+        assert!(!has_compute_context());
+    }
+
+    /// `assert_compute_context` panics with the expected message when no
+    /// context has been initialized on this thread.
+    #[test]
+    #[should_panic(expected = "Thread-local compute context not initialized")]
+    fn test_assert_compute_context_panics() {
+        // Should panic when context not initialized
+        assert_compute_context();
+    }
+}
--- a/lib/runtime/src/compute/validation.rs
+++ b/lib/runtime/src/compute/validation.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Validation module for compute task timing
+//!
+//! This module is only compiled when the `compute-validation` feature is enabled.
+//! It provides functions to validate that compute tasks are correctly classified
+//! as small, medium, or large based on their execution time.
+
+#[cfg(feature = "compute-validation")]
+use std::sync::atomic::{AtomicU64, Ordering};
+#[cfg(feature = "compute-validation")]
+use std::time::Duration;
+#[cfg(feature = "compute-validation")]
+use tracing::warn;
+
+/// Threshold for small tasks in microseconds (<100μs)
+///
+/// `validate_small` flags tasks that take longer than this; `validate_medium`
+/// flags tasks that take less.
+#[cfg(feature = "compute-validation")]
+pub const SMALL_THRESHOLD_US: u64 = 100;
+
+/// Threshold for medium tasks in microseconds (100μs - 1ms)
+///
+/// `validate_medium` flags tasks that take longer than this; `validate_large`
+/// flags tasks that take less.
+#[cfg(feature = "compute-validation")]
+pub const MEDIUM_THRESHOLD_US: u64 = 1000;
+
+// Metrics counters for misclassified tasks, one per task class. They are
+// process-wide statics, read by `get_misclassification_metrics` and cleared by
+// `reset_misclassification_metrics`.
+#[cfg(feature = "compute-validation")]
+static SMALL_MISCLASSIFIED: AtomicU64 = AtomicU64::new(0);
+#[cfg(feature = "compute-validation")]
+static MEDIUM_MISCLASSIFIED: AtomicU64 = AtomicU64::new(0);
+#[cfg(feature = "compute-validation")]
+static LARGE_MISCLASSIFIED: AtomicU64 = AtomicU64::new(0);
+
+/// Check a `compute_small!` task's duration against the small threshold
+///
+/// When `elapsed` is longer than `SMALL_THRESHOLD_US`, the small
+/// misclassification counter is incremented and a warning is logged.
+#[cfg(feature = "compute-validation")]
+pub fn validate_small(elapsed: Duration) {
+    let micros = elapsed.as_micros() as u64;
+    if micros <= SMALL_THRESHOLD_US {
+        return;
+    }
+
+    SMALL_MISCLASSIFIED.fetch_add(1, Ordering::Relaxed);
+    warn!(
+        task_duration_us = micros,
+        threshold_us = SMALL_THRESHOLD_US,
+        "compute_small! task exceeded threshold. Consider using compute_medium!"
+    );
+}
+
+/// Check a `compute_medium!` task's duration against the medium range
+///
+/// A duration below `SMALL_THRESHOLD_US` or above `MEDIUM_THRESHOLD_US`
+/// increments the medium misclassification counter and logs a warning that
+/// names the better-fitting macro.
+#[cfg(feature = "compute-validation")]
+pub fn validate_medium(elapsed: Duration) {
+    let micros = elapsed.as_micros() as u64;
+    let too_fast = micros < SMALL_THRESHOLD_US;
+    let too_slow = micros > MEDIUM_THRESHOLD_US;
+    if !(too_fast || too_slow) {
+        return;
+    }
+
+    MEDIUM_MISCLASSIFIED.fetch_add(1, Ordering::Relaxed);
+    if too_fast {
+        warn!(
+            task_duration_us = micros,
+            threshold_us = SMALL_THRESHOLD_US,
+            "compute_medium! task below small threshold. Consider using compute_small!"
+        );
+    } else {
+        warn!(
+            task_duration_us = micros,
+            threshold_us = MEDIUM_THRESHOLD_US,
+            "compute_medium! task exceeded threshold. Consider using compute_large!"
+        );
+    }
+}
+
+/// Check a `compute_large!` task's duration against the medium threshold
+///
+/// When `elapsed` is shorter than `MEDIUM_THRESHOLD_US`, the large
+/// misclassification counter is incremented and a warning is logged.
+#[cfg(feature = "compute-validation")]
+pub fn validate_large(elapsed: Duration) {
+    let micros = elapsed.as_micros() as u64;
+    if micros >= MEDIUM_THRESHOLD_US {
+        return;
+    }
+
+    LARGE_MISCLASSIFIED.fetch_add(1, Ordering::Relaxed);
+    warn!(
+        task_duration_us = micros,
+        threshold_us = MEDIUM_THRESHOLD_US,
+        "compute_large! task below medium threshold. Consider using compute_medium! or compute_small!"
+    );
+}
+
+/// Read the misclassification counters
+///
+/// Returns `(small, medium, large)`: the number of tasks flagged by
+/// `validate_small`, `validate_medium`, and `validate_large` respectively.
+#[cfg(feature = "compute-validation")]
+pub fn get_misclassification_metrics() -> (u64, u64, u64) {
+    let small = SMALL_MISCLASSIFIED.load(Ordering::Relaxed);
+    let medium = MEDIUM_MISCLASSIFIED.load(Ordering::Relaxed);
+    let large = LARGE_MISCLASSIFIED.load(Ordering::Relaxed);
+    (small, medium, large)
+}
+
+/// Set all three misclassification counters back to zero
+#[cfg(feature = "compute-validation")]
+pub fn reset_misclassification_metrics() {
+    for counter in [
+        &SMALL_MISCLASSIFIED,
+        &MEDIUM_MISCLASSIFIED,
+        &LARGE_MISCLASSIFIED,
+    ] {
+        counter.store(0, Ordering::Relaxed);
+    }
+}
--- a/lib/runtime/src/config.rs
+++ b/lib/runtime/src/config.rs
@@ -131,6 +131,26 @@ pub struct RuntimeConfig {
    #[builder_field_attr(serde(skip_serializing_if = "Option::is_none"))]
    pub system_live_path: String,

+    /// Number of threads for the Rayon compute pool
+    /// If not set, defaults to half the available cores, clamped to the range 2..=16
+    /// Set this at runtime with environment variable DYN_COMPUTE_THREADS
+    #[builder(default = "None")]
+    #[builder_field_attr(serde(skip_serializing_if = "Option::is_none"))]
+    pub compute_threads: Option<usize>,
+
+    /// Stack size for compute threads in bytes
+    /// Defaults to 2MB (2097152 bytes)
+    /// Set this at runtime with environment variable DYN_COMPUTE_STACK_SIZE
+    #[builder(default = "Some(2 * 1024 * 1024)")]
+    #[builder_field_attr(serde(skip_serializing_if = "Option::is_none"))]
+    pub compute_stack_size: Option<usize>,
+
+    /// Thread name prefix for compute pool threads
+    /// Set this at runtime with environment variable DYN_COMPUTE_THREAD_PREFIX
+    #[builder(default = "\"compute\".to_string()")]
+    #[builder_field_attr(serde(skip_serializing_if = "Option::is_none"))]
+    pub compute_thread_prefix: String,
+
    /// Enable active health checking with payloads
    /// Set this at runtime with environment variable DYN_HEALTH_CHECK_ENABLED
    #[builder(default = "false")]
@@ -225,6 +245,23 @@ impl RuntimeConfig {
                    _ => None,
                }
            }))
+            .merge(Env::prefixed("DYN_COMPUTE_").filter_map(|k| {
+                let full_key = format!("DYN_COMPUTE_{}", k.as_str());
+                // filters out empty environment variables
+                match std::env::var(&full_key) {
+                    Ok(v) if !v.is_empty() => {
+                        // Map DYN_COMPUTE_* to the correct field names
+                        let mapped_key = match k.as_str() {
+                            "THREADS" => "compute_threads",
+                            "STACK_SIZE" => "compute_stack_size",
+                            "THREAD_PREFIX" => "compute_thread_prefix",
+                            _ => k.as_str(),
+                        };
+                        Some(mapped_key.into())
+                    }
+                    _ => None,
+                }
+            }))
            .merge(Env::prefixed("DYN_HEALTH_CHECK_").filter_map(|k| {
                let full_key = format!("DYN_HEALTH_CHECK_{}", k.as_str());
                // filters out empty environment variables
@@ -298,6 +335,9 @@ impl RuntimeConfig {
            use_endpoint_health_status: vec![],
            system_health_path: DEFAULT_SYSTEM_HEALTH_PATH.to_string(),
            system_live_path: DEFAULT_SYSTEM_LIVE_PATH.to_string(),
+            compute_threads: Some(1),
+            compute_stack_size: Some(2 * 1024 * 1024),
+            compute_thread_prefix: "compute".to_string(),
            health_check_enabled: false,
            canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS,
            health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS,
@@ -330,6 +370,9 @@ impl Default for RuntimeConfig {
            use_endpoint_health_status: vec![],
            system_health_path: DEFAULT_SYSTEM_HEALTH_PATH.to_string(),
            system_live_path: DEFAULT_SYSTEM_LIVE_PATH.to_string(),
+            compute_threads: None,
+            compute_stack_size: Some(2 * 1024 * 1024),
+            compute_thread_prefix: "compute".to_string(),
            health_check_enabled: false,
            canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS,
            health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS,

--- a/lib/runtime/src/lib.rs
+++ b/lib/runtime/src/lib.rs
@@ -21,6 +21,7 @@ mod config;
 pub use config::RuntimeConfig;

 pub mod component;
+pub mod compute;
 pub mod discovery;
 pub mod engine;
 pub mod health_check;
@@ -73,6 +74,8 @@ pub struct Runtime {
    cancellation_token: CancellationToken,
    endpoint_shutdown_token: CancellationToken,
    graceful_shutdown_tracker: Arc<GracefulShutdownTracker>,
+    compute_pool: Option<Arc<compute::ComputePool>>,
+    block_in_place_permits: Option<Arc<tokio::sync::Semaphore>>,
 }

 /// Type alias for runtime callback functions to reduce complexity

--- a/lib/runtime/src/runtime.rs
+++ b/lib/runtime/src/runtime.rs
@@ -44,6 +44,11 @@ impl Runtime {
            }
        };

+        // No compute pool or block_in_place permits are created here; both start as `None`.
+        // `new_with_config` fills them in when the runtime is built from a RuntimeConfig.
+        let compute_pool = None;
+        let block_in_place_permits = None;
+
        Ok(Runtime {
            id,
            primary: runtime,
@@ -51,9 +56,157 @@ impl Runtime {
            cancellation_token,
            endpoint_shutdown_token,
            graceful_shutdown_tracker: Arc::new(GracefulShutdownTracker::new()),
+            compute_pool,
+            block_in_place_permits,
        })
    }

+    /// Builds a [`Runtime`] via [`Runtime::new`] and then attaches the compute resources
+    /// described by `config`.
+    ///
+    /// * `runtime` / `secondary` - forwarded unchanged to [`Runtime::new`].
+    /// * `config` - supplies `compute_threads`, `compute_stack_size`,
+    ///   `compute_thread_prefix` and `num_worker_threads`.
+    ///
+    /// The compute pool stays `None` when `compute_threads` is `Some(0)` or when
+    /// `ComputePool::new` fails; the failure is logged as a warning rather than returned.
+    /// The `block_in_place` semaphore is always created.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by [`Runtime::new`].
+    fn new_with_config(
+        runtime: RuntimeType,
+        secondary: Option<RuntimeType>,
+        config: &RuntimeConfig,
+    ) -> Result<Runtime> {
+        let mut rt = Self::new(runtime, secondary)?;
+
+        // Check if compute pool is explicitly disabled
+        if config.compute_threads == Some(0) {
+            tracing::info!("Compute pool disabled (compute_threads = 0)");
+        } else {
+            // Only build the pool configuration when a pool is actually requested.
+            let compute_config = crate::compute::ComputeConfig {
+                num_threads: config.compute_threads,
+                stack_size: config.compute_stack_size,
+                thread_prefix: config.compute_thread_prefix.clone(),
+                pin_threads: false,
+            };
+
+            match crate::compute::ComputePool::new(compute_config) {
+                Ok(pool) => {
+                    let pool = Arc::new(pool);
+                    tracing::debug!(
+                        "Initialized compute pool with {} threads",
+                        pool.num_threads()
+                    );
+                    rt.compute_pool = Some(pool);
+                }
+                Err(e) => {
+                    tracing::warn!(
+                        "Failed to create compute pool: {}. CPU-intensive operations will use spawn_blocking",
+                        e
+                    );
+                }
+            }
+        }
+
+        // Size the block_in_place semaphore from the configured worker count. When it is not
+        // configured, ask the OS; `available_parallelism` can return an error, so fall back
+        // to a single worker instead of panicking during runtime construction.
+        let num_workers = config.num_worker_threads.unwrap_or_else(|| {
+            std::thread::available_parallelism().map_or(1, |n| n.get())
+        });
+        // Reserve at least one thread for async work, but never hand out zero permits.
+        let permits = num_workers.saturating_sub(1).max(1);
+        rt.block_in_place_permits = Some(Arc::new(tokio::sync::Semaphore::new(permits)));
+        tracing::debug!(
+            "Initialized block_in_place permits: {} (from {} worker threads)",
+            permits,
+            num_workers
+        );
+
+        Ok(rt)
+    }
+
+    /// Hands this runtime's compute pool and `block_in_place` permits to
+    /// `compute::thread_local::initialize_context` on the calling thread.
+    ///
+    /// Does nothing unless both the pool and the permits are present. Meant to be invoked
+    /// from each Tokio worker thread.
+    pub fn initialize_thread_local(&self) {
+        let resources = self
+            .compute_pool
+            .as_ref()
+            .zip(self.block_in_place_permits.as_ref());
+
+        if let Some((pool, permits)) = resources {
+            crate::compute::thread_local::initialize_context(Arc::clone(pool), Arc::clone(permits));
+        }
+    }
+
+    /// Runs `compute::thread_local::initialize_context` from a batch of `spawn_blocking`
+    /// tasks that rendezvous on a shared barrier before initializing.
+    ///
+    /// When either the compute pool or the `block_in_place` permits are absent, this only
+    /// logs a debug message and returns `Ok(())`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `detect_worker_thread_count` reports zero threads, or if awaiting any of the
+    /// spawned tasks yields an error.
+    ///
+    /// NOTE(review): the tasks are started with `tokio::task::spawn_blocking`; confirm that
+    /// the threads they land on are the ones whose thread-local context must be set.
+    pub async fn initialize_all_thread_locals(&self) -> Result<()> {
+        let (pool, permits) = match (&self.compute_pool, &self.block_in_place_permits) {
+            (Some(pool), Some(permits)) => (pool, permits),
+            _ => {
+                tracing::debug!(
+                    "No compute pool configured, skipping thread-local initialization"
+                );
+                return Ok(());
+            }
+        };
+
+        // The probe result sizes both the barrier and the task batch below.
+        let worker_count = self.detect_worker_thread_count().await;
+        if worker_count == 0 {
+            return Err(anyhow::anyhow!("No worker threads detected"));
+        }
+
+        // Each task blocks on the barrier until `worker_count` tasks have arrived, so no
+        // single thread can run two of them back to back.
+        let rendezvous = Arc::new(std::sync::Barrier::new(worker_count));
+
+        let tasks: Vec<_> = (0..worker_count)
+            .map(|worker_index| {
+                let rendezvous = Arc::clone(&rendezvous);
+                let pool = Arc::clone(pool);
+                let permits = Arc::clone(permits);
+
+                tokio::task::spawn_blocking(move || {
+                    rendezvous.wait();
+
+                    crate::compute::thread_local::initialize_context(pool, permits);
+
+                    let thread_id = std::thread::current().id();
+                    tracing::trace!(
+                        "Initialized thread-local compute context on thread {:?} (worker {})",
+                        thread_id,
+                        worker_index
+                    );
+                })
+            })
+            .collect();
+
+        // Surface the first join error, if any.
+        for task in tasks {
+            task.await?;
+        }
+
+        tracing::info!(
+            "Successfully initialized thread-local compute context on {} worker threads",
+            worker_count
+        );
+        Ok(())
+    }
+
+    /// Counts the distinct threads that execute a batch of short `spawn_blocking` probes.
+    ///
+    /// Each probe records `std::thread::current().id()` in a shared set; the size of that set
+    /// is logged at debug level and returned. Join results of the probes are ignored.
+    ///
+    /// NOTE(review): the result is the number of distinct threads that happened to run a
+    /// probe; confirm that this matches the runtime's worker-thread count.
+    async fn detect_worker_thread_count(&self) -> usize {
+        use std::collections::HashSet;
+        use std::sync::Mutex;
+
+        // Number of probe tasks submitted.
+        const PROBE_COUNT: usize = 100;
+
+        let seen = Arc::new(Mutex::new(HashSet::new()));
+
+        let probes: Vec<_> = (0..PROBE_COUNT)
+            .map(|_| {
+                let seen = Arc::clone(&seen);
+                tokio::task::spawn_blocking(move || {
+                    let thread_id = std::thread::current().id();
+                    seen.lock().unwrap().insert(thread_id);
+                })
+            })
+            .collect();
+
+        // Let every probe finish before reading the set.
+        for probe in probes {
+            let _ = probe.await;
+        }
+
+        let count = seen.lock().unwrap().len();
+        tracing::debug!("Detected {} worker threads in runtime", count);
+        count
+    }
+
    pub fn from_current() -> Result<Runtime> {
        Runtime::from_handle(tokio::runtime::Handle::current())
    }
@@ -71,7 +224,7 @@ impl Runtime {
        let runtime = Arc::new(config.create_runtime()?);
        let primary = RuntimeType::Shared(runtime.clone());
        let secondary = RuntimeType::External(runtime.handle().clone());
-        Runtime::new(primary, Some(secondary))
+        Runtime::new_with_config(primary, Some(secondary), &config)
    }

    /// Create a [`Runtime`] with two single-threaded async tokio runtime
@@ -111,6 +264,13 @@ impl Runtime {
        self.graceful_shutdown_tracker.clone()
    }

+    /// Borrows the runtime's shared compute pool, intended for CPU-intensive operations.
+    ///
+    /// Returns `None` when no pool is stored: `new` never sets one, and `new_with_config` skips it for `compute_threads == Some(0)` or a failed `ComputePool::new`.
+    pub fn compute_pool(&self) -> Option<&Arc<crate::compute::ComputePool>> {
+        self.compute_pool.as_ref()
+    }
+
    /// Shuts down the [`Runtime`] instance
    pub fn shutdown(&self) {
        tracing::info!("Runtime shutdown initiated");